diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 8bf605e5e76b..533c5b1f6ff0 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1,19179 +1,19183 @@
//===--- SemaOpenMP.cpp - Semantic Analysis for OpenMP constructs ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements semantic analysis for OpenMP directives and
/// clauses.
///
//===----------------------------------------------------------------------===//
#include "TreeTransform.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/AST/TypeOrdering.h"
#include "clang/Basic/DiagnosticSema.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/PartialDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/SemaInternal.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/PointerEmbeddedInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include <set>
using namespace clang;
using namespace llvm::omp;
//===----------------------------------------------------------------------===//
// Stack of data-sharing attributes for variables
//===----------------------------------------------------------------------===//
static const Expr *checkMapClauseExpressionBase(
Sema &SemaRef, Expr *E,
OMPClauseMappableExprCommon::MappableExprComponentList &CurComponents,
OpenMPClauseKind CKind, bool NoDiagnose);
namespace {
/// Default data sharing attributes, which can be applied to a directive.
enum DefaultDataSharingAttributes {
DSA_unspecified = 0, /// Data sharing attribute not specified.
DSA_none = 1 << 0, /// Default data sharing attribute 'none'.
DSA_shared = 1 << 1, /// Default data sharing attribute 'shared'.
DSA_firstprivate = 1 << 2, /// Default data sharing attribute 'firstprivate'.
};
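// For illustration, the 'default' clause spellings these attributes model
// (sketch only):
//   #pragma omp parallel default(none)          -> DSA_none
//   #pragma omp parallel default(shared)        -> DSA_shared
//   #pragma omp parallel default(firstprivate)  -> DSA_firstprivate
//   (no default clause)                         -> DSA_unspecified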
/// Stack for tracking declarations used in OpenMP directives and
/// clauses and their data-sharing attributes.
class DSAStackTy {
public:
struct DSAVarData {
OpenMPDirectiveKind DKind = OMPD_unknown;
OpenMPClauseKind CKind = OMPC_unknown;
unsigned Modifier = 0;
const Expr *RefExpr = nullptr;
DeclRefExpr *PrivateCopy = nullptr;
SourceLocation ImplicitDSALoc;
DSAVarData() = default;
DSAVarData(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind,
const Expr *RefExpr, DeclRefExpr *PrivateCopy,
SourceLocation ImplicitDSALoc, unsigned Modifier)
: DKind(DKind), CKind(CKind), Modifier(Modifier), RefExpr(RefExpr),
PrivateCopy(PrivateCopy), ImplicitDSALoc(ImplicitDSALoc) {}
};
using OperatorOffsetTy =
llvm::SmallVector<std::pair<Expr *, OverloadedOperatorKind>, 4>;
using DoacrossDependMapTy =
llvm::DenseMap<OMPDependClause *, OperatorOffsetTy>;
/// Kind of the declaration used in the uses_allocators clauses.
enum class UsesAllocatorsDeclKind {
/// Predefined allocator
PredefinedAllocator,
/// User-defined allocator
UserDefinedAllocator,
/// The declaration that represents an allocator trait
AllocatorTrait,
};
private:
struct DSAInfo {
OpenMPClauseKind Attributes = OMPC_unknown;
unsigned Modifier = 0;
/// Pointer to a reference expression and a flag which shows that the
/// variable is marked as lastprivate(true) or not (false).
llvm::PointerIntPair<const Expr *, 1, bool> RefExpr;
DeclRefExpr *PrivateCopy = nullptr;
};
using DeclSAMapTy = llvm::SmallDenseMap<const ValueDecl *, DSAInfo, 8>;
using UsedRefMapTy = llvm::SmallDenseMap<const ValueDecl *, const Expr *, 8>;
using LCDeclInfo = std::pair<unsigned, VarDecl *>;
using LoopControlVariablesMapTy =
llvm::SmallDenseMap<const ValueDecl *, LCDeclInfo, 8>;
/// Struct that associates a component with the clause kind where it is
/// found.
struct MappedExprComponentTy {
OMPClauseMappableExprCommon::MappableExprComponentLists Components;
OpenMPClauseKind Kind = OMPC_unknown;
};
using MappedExprComponentsTy =
llvm::DenseMap<const ValueDecl *, MappedExprComponentTy>;
using CriticalsWithHintsTy =
llvm::StringMap<std::pair<const OMPCriticalDirective *, llvm::APSInt>>;
struct ReductionData {
using BOKPtrType = llvm::PointerEmbeddedInt<BinaryOperatorKind, 16>;
SourceRange ReductionRange;
llvm::PointerUnion<const Expr *, BOKPtrType> ReductionOp;
ReductionData() = default;
void set(BinaryOperatorKind BO, SourceRange RR) {
ReductionRange = RR;
ReductionOp = BO;
}
void set(const Expr *RefExpr, SourceRange RR) {
ReductionRange = RR;
ReductionOp = RefExpr;
}
};
using DeclReductionMapTy =
llvm::SmallDenseMap<const ValueDecl *, ReductionData, 4>;
struct DefaultmapInfo {
OpenMPDefaultmapClauseModifier ImplicitBehavior =
OMPC_DEFAULTMAP_MODIFIER_unknown;
SourceLocation SLoc;
DefaultmapInfo() = default;
DefaultmapInfo(OpenMPDefaultmapClauseModifier M, SourceLocation Loc)
: ImplicitBehavior(M), SLoc(Loc) {}
};
struct SharingMapTy {
DeclSAMapTy SharingMap;
DeclReductionMapTy ReductionMap;
UsedRefMapTy AlignedMap;
UsedRefMapTy NontemporalMap;
MappedExprComponentsTy MappedExprComponents;
LoopControlVariablesMapTy LCVMap;
DefaultDataSharingAttributes DefaultAttr = DSA_unspecified;
SourceLocation DefaultAttrLoc;
DefaultmapInfo DefaultmapMap[OMPC_DEFAULTMAP_unknown];
OpenMPDirectiveKind Directive = OMPD_unknown;
DeclarationNameInfo DirectiveName;
Scope *CurScope = nullptr;
SourceLocation ConstructLoc;
/// Set of 'depend' clauses with 'sink|source' dependence kind. Required to
/// get the data (loop counters etc.) about enclosing loop-based construct.
/// This data is required during codegen.
DoacrossDependMapTy DoacrossDepends;
/// The first element (Expr *) is the optional argument of the 'ordered'
/// clause, the second is the 'ordered' clause itself; unset if the region
/// has no 'ordered' clause.
llvm::Optional<std::pair<const Expr *, OMPOrderedClause *>> OrderedRegion;
unsigned AssociatedLoops = 1;
bool HasMutipleLoops = false;
const Decl *PossiblyLoopCounter = nullptr;
bool NowaitRegion = false;
bool CancelRegion = false;
bool LoopStart = false;
bool BodyComplete = false;
SourceLocation PrevScanLocation;
SourceLocation PrevOrderedLocation;
SourceLocation InnerTeamsRegionLoc;
/// Reference to the taskgroup task_reduction reference expression.
Expr *TaskgroupReductionRef = nullptr;
llvm::DenseSet<QualType> MappedClassesQualTypes;
SmallVector<Expr *, 4> InnerUsedAllocators;
llvm::DenseSet<CanonicalDeclPtr<Decl>> ImplicitTaskFirstprivates;
/// List of globals marked as declare target link in this target region
/// (isOpenMPTargetExecutionDirective(Directive) == true).
llvm::SmallVector<DeclRefExpr *, 4> DeclareTargetLinkVarDecls;
/// List of decls used in inclusive/exclusive clauses of the scan directive.
llvm::DenseSet<CanonicalDeclPtr<Decl>> UsedInScanDirective;
llvm::DenseMap<CanonicalDeclPtr<const Decl>, UsesAllocatorsDeclKind>
UsesAllocatorsDecls;
SharingMapTy(OpenMPDirectiveKind DKind, DeclarationNameInfo Name,
Scope *CurScope, SourceLocation Loc)
: Directive(DKind), DirectiveName(Name), CurScope(CurScope),
ConstructLoc(Loc) {}
SharingMapTy() = default;
};
using StackTy = SmallVector<SharingMapTy, 4>;
/// Stack of used declarations and their data-sharing attributes.
DeclSAMapTy Threadprivates;
const FunctionScopeInfo *CurrentNonCapturingFunctionScope = nullptr;
SmallVector<std::pair<StackTy, const FunctionScopeInfo *>, 4> Stack;
/// The clause kind currently being parsed; OMPC_unknown when not in clause
/// parsing mode.
OpenMPClauseKind ClauseKindMode = OMPC_unknown;
Sema &SemaRef;
bool ForceCapturing = false;
/// true if all the variables in the target executable directives must be
/// captured by reference.
bool ForceCaptureByReferenceInTargetExecutable = false;
CriticalsWithHintsTy Criticals;
unsigned IgnoredStackElements = 0;
/// Iterators over the stack iterate in order from innermost to outermost
/// directive.
using const_iterator = StackTy::const_reverse_iterator;
const_iterator begin() const {
return Stack.empty() ? const_iterator()
: Stack.back().first.rbegin() + IgnoredStackElements;
}
const_iterator end() const {
return Stack.empty() ? const_iterator() : Stack.back().first.rend();
}
using iterator = StackTy::reverse_iterator;
iterator begin() {
return Stack.empty() ? iterator()
: Stack.back().first.rbegin() + IgnoredStackElements;
}
iterator end() {
return Stack.empty() ? iterator() : Stack.back().first.rend();
}
// Convenience operations to get at the elements of the stack.
bool isStackEmpty() const {
return Stack.empty() ||
Stack.back().second != CurrentNonCapturingFunctionScope ||
Stack.back().first.size() <= IgnoredStackElements;
}
size_t getStackSize() const {
return isStackEmpty() ? 0
: Stack.back().first.size() - IgnoredStackElements;
}
SharingMapTy *getTopOfStackOrNull() {
size_t Size = getStackSize();
if (Size == 0)
return nullptr;
return &Stack.back().first[Size - 1];
}
const SharingMapTy *getTopOfStackOrNull() const {
return const_cast<DSAStackTy&>(*this).getTopOfStackOrNull();
}
SharingMapTy &getTopOfStack() {
assert(!isStackEmpty() && "no current directive");
return *getTopOfStackOrNull();
}
const SharingMapTy &getTopOfStack() const {
return const_cast<DSAStackTy&>(*this).getTopOfStack();
}
SharingMapTy *getSecondOnStackOrNull() {
size_t Size = getStackSize();
if (Size <= 1)
return nullptr;
return &Stack.back().first[Size - 2];
}
const SharingMapTy *getSecondOnStackOrNull() const {
return const_cast<DSAStackTy&>(*this).getSecondOnStackOrNull();
}
/// Get the stack element at a certain level (previously returned by
/// \c getNestingLevel).
///
/// Note that nesting levels count from outermost to innermost, and this is
/// the reverse of our iteration order where new inner levels are pushed at
/// the front of the stack.
SharingMapTy &getStackElemAtLevel(unsigned Level) {
assert(Level < getStackSize() && "no such stack element");
return Stack.back().first[Level];
}
const SharingMapTy &getStackElemAtLevel(unsigned Level) const {
return const_cast<DSAStackTy&>(*this).getStackElemAtLevel(Level);
}
DSAVarData getDSA(const_iterator &Iter, ValueDecl *D) const;
/// Checks if the variable is local to the OpenMP region.
bool isOpenMPLocal(VarDecl *D, const_iterator Iter) const;
/// Vector of previously declared requires directives
SmallVector<const OMPRequiresDecl *, 2> RequiresDecls;
/// omp_allocator_handle_t type.
QualType OMPAllocatorHandleT;
/// omp_depend_t type.
QualType OMPDependT;
/// omp_event_handle_t type.
QualType OMPEventHandleT;
/// omp_alloctrait_t type.
QualType OMPAlloctraitT;
/// Expression for the predefined allocators.
Expr *OMPPredefinedAllocators[OMPAllocateDeclAttr::OMPUserDefinedMemAlloc] = {
nullptr};
/// Vector of previously encountered target directives
SmallVector<SourceLocation, 2> TargetLocations;
SourceLocation AtomicLocation;
public:
explicit DSAStackTy(Sema &S) : SemaRef(S) {}
/// Sets omp_allocator_handle_t type.
void setOMPAllocatorHandleT(QualType Ty) { OMPAllocatorHandleT = Ty; }
/// Gets omp_allocator_handle_t type.
QualType getOMPAllocatorHandleT() const { return OMPAllocatorHandleT; }
/// Sets omp_alloctrait_t type.
void setOMPAlloctraitT(QualType Ty) { OMPAlloctraitT = Ty; }
/// Gets omp_alloctrait_t type.
QualType getOMPAlloctraitT() const { return OMPAlloctraitT; }
/// Sets the given default allocator.
void setAllocator(OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind,
Expr *Allocator) {
OMPPredefinedAllocators[AllocatorKind] = Allocator;
}
/// Returns the specified default allocator.
Expr *getAllocator(OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind) const {
return OMPPredefinedAllocators[AllocatorKind];
}
/// Sets omp_depend_t type.
void setOMPDependT(QualType Ty) { OMPDependT = Ty; }
/// Gets omp_depend_t type.
QualType getOMPDependT() const { return OMPDependT; }
/// Sets omp_event_handle_t type.
void setOMPEventHandleT(QualType Ty) { OMPEventHandleT = Ty; }
/// Gets omp_event_handle_t type.
QualType getOMPEventHandleT() const { return OMPEventHandleT; }
bool isClauseParsingMode() const { return ClauseKindMode != OMPC_unknown; }
OpenMPClauseKind getClauseParsingMode() const {
assert(isClauseParsingMode() && "Must be in clause parsing mode.");
return ClauseKindMode;
}
void setClauseParsingMode(OpenMPClauseKind K) { ClauseKindMode = K; }
bool isBodyComplete() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top && Top->BodyComplete;
}
void setBodyComplete() {
getTopOfStack().BodyComplete = true;
}
bool isForceVarCapturing() const { return ForceCapturing; }
void setForceVarCapturing(bool V) { ForceCapturing = V; }
void setForceCaptureByReferenceInTargetExecutable(bool V) {
ForceCaptureByReferenceInTargetExecutable = V;
}
bool isForceCaptureByReferenceInTargetExecutable() const {
return ForceCaptureByReferenceInTargetExecutable;
}
void push(OpenMPDirectiveKind DKind, const DeclarationNameInfo &DirName,
Scope *CurScope, SourceLocation Loc) {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
if (Stack.empty() ||
Stack.back().second != CurrentNonCapturingFunctionScope)
Stack.emplace_back(StackTy(), CurrentNonCapturingFunctionScope);
Stack.back().first.emplace_back(DKind, DirName, CurScope, Loc);
Stack.back().first.back().DefaultAttrLoc = Loc;
}
void pop() {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
assert(!Stack.back().first.empty() &&
"Data-sharing attributes stack is empty!");
Stack.back().first.pop_back();
}
/// RAII object to temporarily leave the scope of a directive when we want to
/// logically operate in its parent.
class ParentDirectiveScope {
DSAStackTy &Self;
bool Active;
public:
ParentDirectiveScope(DSAStackTy &Self, bool Activate)
: Self(Self), Active(false) {
if (Activate)
enable();
}
~ParentDirectiveScope() { disable(); }
void disable() {
if (Active) {
--Self.IgnoredStackElements;
Active = false;
}
}
void enable() {
if (!Active) {
++Self.IgnoredStackElements;
Active = true;
}
}
};
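// Usage sketch (illustrative): given a DSAStackTy *Stack, a scope such as
//   {
//     DSAStackTy::ParentDirectiveScope PDS(*Stack, /*Activate=*/true);
//     // queries like Stack->getCurrentDirective() now see the parent
//   }
// hides the innermost stack element for the duration of the block.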
/// Marks that loop parsing has started.
void loopInit() {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
getTopOfStack().LoopStart = true;
}
/// Start capturing of the variables in the loop context.
void loopStart() {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
getTopOfStack().LoopStart = false;
}
/// true, if variables are captured, false otherwise.
bool isLoopStarted() const {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
return !getTopOfStack().LoopStart;
}
/// Marks (or clears) declaration as possibly loop counter.
void resetPossibleLoopCounter(const Decl *D = nullptr) {
getTopOfStack().PossiblyLoopCounter =
D ? D->getCanonicalDecl() : D;
}
/// Gets the possible loop counter decl.
const Decl *getPossiblyLoopCunter() const {
return getTopOfStack().PossiblyLoopCounter;
}
/// Start new OpenMP region stack in new non-capturing function.
void pushFunction() {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
const FunctionScopeInfo *CurFnScope = SemaRef.getCurFunction();
assert(!isa<CapturingScopeInfo>(CurFnScope));
CurrentNonCapturingFunctionScope = CurFnScope;
}
/// Pop region stack for non-capturing function.
void popFunction(const FunctionScopeInfo *OldFSI) {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
if (!Stack.empty() && Stack.back().second == OldFSI) {
assert(Stack.back().first.empty());
Stack.pop_back();
}
CurrentNonCapturingFunctionScope = nullptr;
for (const FunctionScopeInfo *FSI : llvm::reverse(SemaRef.FunctionScopes)) {
if (!isa<CapturingScopeInfo>(FSI)) {
CurrentNonCapturingFunctionScope = FSI;
break;
}
}
}
void addCriticalWithHint(const OMPCriticalDirective *D, llvm::APSInt Hint) {
Criticals.try_emplace(D->getDirectiveName().getAsString(), D, Hint);
}
const std::pair<const OMPCriticalDirective *, llvm::APSInt>
getCriticalWithHint(const DeclarationNameInfo &Name) const {
auto I = Criticals.find(Name.getAsString());
if (I != Criticals.end())
return I->second;
return std::make_pair(nullptr, llvm::APSInt());
}
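// For reference (illustrative source form), the constructs tracked here are
//   #pragma omp critical (name) hint(omp_sync_hint_contended)
// where all 'critical' constructs with the same name must use the same hint;
// the map above is what lets us diagnose a mismatch.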
/// If 'aligned' declaration for given variable \a D was not seen yet,
/// add it and return NULL; otherwise return previous occurrence's expression
/// for diagnostics.
const Expr *addUniqueAligned(const ValueDecl *D, const Expr *NewDE);
/// If 'nontemporal' declaration for given variable \a D was not seen yet,
/// add it and return NULL; otherwise return previous occurrence's expression
/// for diagnostics.
const Expr *addUniqueNontemporal(const ValueDecl *D, const Expr *NewDE);
/// Register specified variable as loop control variable.
void addLoopControlVariable(const ValueDecl *D, VarDecl *Capture);
/// Check if the specified variable is a loop control variable for
/// current region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
const LCDeclInfo isLoopControlVariable(const ValueDecl *D) const;
/// Check if the specified variable is a loop control variable for
/// parent region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
const LCDeclInfo isParentLoopControlVariable(const ValueDecl *D) const;
/// Check if the specified variable is a loop control variable for
/// current region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
const LCDeclInfo isLoopControlVariable(const ValueDecl *D,
unsigned Level) const;
/// Get the loop control variable for the I-th loop (or nullptr) in
/// parent directive.
const ValueDecl *getParentLoopControlVariable(unsigned I) const;
/// Marks the specified decl \p D as used in scan directive.
void markDeclAsUsedInScanDirective(ValueDecl *D) {
if (SharingMapTy *Stack = getSecondOnStackOrNull())
Stack->UsedInScanDirective.insert(D);
}
/// Checks if the specified declaration was used in the inner scan directive.
bool isUsedInScanDirective(ValueDecl *D) const {
if (const SharingMapTy *Stack = getTopOfStackOrNull())
return Stack->UsedInScanDirective.count(D) > 0;
return false;
}
/// Adds explicit data sharing attribute to the specified declaration.
void addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A,
DeclRefExpr *PrivateCopy = nullptr, unsigned Modifier = 0);
/// Adds additional information for the reduction items with the reduction id
/// represented as an operator.
void addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
BinaryOperatorKind BOK);
/// Adds additional information for the reduction items with the reduction id
/// represented as reduction identifier.
void addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
const Expr *ReductionRef);
/// Returns the location and reduction operation from the innermost parent
/// region for the given \p D.
const DSAVarData
getTopMostTaskgroupReductionData(const ValueDecl *D, SourceRange &SR,
BinaryOperatorKind &BOK,
Expr *&TaskgroupDescriptor) const;
/// Returns the location and reduction operation from the innermost parent
/// region for the given \p D.
const DSAVarData
getTopMostTaskgroupReductionData(const ValueDecl *D, SourceRange &SR,
const Expr *&ReductionRef,
Expr *&TaskgroupDescriptor) const;
/// Return reduction reference expression for the current taskgroup or
/// parallel/worksharing directives with task reductions.
Expr *getTaskgroupReductionRef() const {
assert((getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"taskgroup reference expression requested for non taskgroup or "
"parallel/worksharing directive.");
return getTopOfStack().TaskgroupReductionRef;
}
/// Checks if the given \p VD declaration is actually a taskgroup reduction
/// descriptor variable at the \p Level of OpenMP regions.
bool isTaskgroupReductionRef(const ValueDecl *VD, unsigned Level) const {
return getStackElemAtLevel(Level).TaskgroupReductionRef &&
cast<DeclRefExpr>(getStackElemAtLevel(Level).TaskgroupReductionRef)
->getDecl() == VD;
}
/// Returns data sharing attributes from top of the stack for the
/// specified declaration.
const DSAVarData getTopDSA(ValueDecl *D, bool FromParent);
/// Returns data-sharing attributes for the specified declaration.
const DSAVarData getImplicitDSA(ValueDecl *D, bool FromParent) const;
/// Returns data-sharing attributes for the specified declaration.
const DSAVarData getImplicitDSA(ValueDecl *D, unsigned Level) const;
/// Checks if the specified variable has data-sharing attributes which
/// match specified \a CPred predicate in any directive which matches \a DPred
/// predicate.
const DSAVarData
hasDSA(ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const;
/// Checks if the specified variable has data-sharing attributes which
/// match specified \a CPred predicate in any innermost directive which
/// matches \a DPred predicate.
const DSAVarData
hasInnermostDSA(ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const;
/// Checks if the specified variable has explicit data-sharing
/// attributes which match specified \a CPred predicate at the specified
/// OpenMP region.
bool hasExplicitDSA(const ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
unsigned Level, bool NotLastprivate = false) const;
/// Returns true if the directive at level \p Level matches the
/// specified \a DPred predicate.
bool hasExplicitDirective(
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
unsigned Level) const;
/// Finds a directive which matches specified \a DPred predicate.
bool hasDirective(
const llvm::function_ref<bool(
OpenMPDirectiveKind, const DeclarationNameInfo &, SourceLocation)>
DPred,
bool FromParent) const;
/// Returns currently analyzed directive.
OpenMPDirectiveKind getCurrentDirective() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->Directive : OMPD_unknown;
}
/// Returns directive kind at specified level.
OpenMPDirectiveKind getDirective(unsigned Level) const {
assert(!isStackEmpty() && "No directive at specified level.");
return getStackElemAtLevel(Level).Directive;
}
/// Returns the capture region at the specified level.
OpenMPDirectiveKind getCaptureRegion(unsigned Level,
unsigned OpenMPCaptureLevel) const {
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, getDirective(Level));
return CaptureRegions[OpenMPCaptureLevel];
}
/// Returns parent directive.
OpenMPDirectiveKind getParentDirective() const {
const SharingMapTy *Parent = getSecondOnStackOrNull();
return Parent ? Parent->Directive : OMPD_unknown;
}
/// Add requires decl to internal vector
void addRequiresDecl(OMPRequiresDecl *RD) {
RequiresDecls.push_back(RD);
}
/// Checks if the defined 'requires' directive has specified type of clause.
template <typename ClauseType>
bool hasRequiresDeclWithClause() const {
return llvm::any_of(RequiresDecls, [](const OMPRequiresDecl *D) {
return llvm::any_of(D->clauselists(), [](const OMPClause *C) {
return isa<ClauseType>(C);
});
});
}
/// Checks for a duplicate clause amongst previously declared requires
/// directives
bool hasDuplicateRequiresClause(ArrayRef<OMPClause *> ClauseList) const {
bool IsDuplicate = false;
for (OMPClause *CNew : ClauseList) {
for (const OMPRequiresDecl *D : RequiresDecls) {
for (const OMPClause *CPrev : D->clauselists()) {
if (CNew->getClauseKind() == CPrev->getClauseKind()) {
SemaRef.Diag(CNew->getBeginLoc(),
diag::err_omp_requires_clause_redeclaration)
<< getOpenMPClauseName(CNew->getClauseKind());
SemaRef.Diag(CPrev->getBeginLoc(),
diag::note_omp_requires_previous_clause)
<< getOpenMPClauseName(CPrev->getClauseKind());
IsDuplicate = true;
}
}
}
}
return IsDuplicate;
}
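// Example (sketch) of what this rejects:
//   #pragma omp requires unified_shared_memory
//   #pragma omp requires unified_shared_memory  // duplicate clause -> error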
/// Add location of previously encountered target to internal vector
void addTargetDirLocation(SourceLocation LocStart) {
TargetLocations.push_back(LocStart);
}
/// Add location for the first encountered atomic directive.
void addAtomicDirectiveLoc(SourceLocation Loc) {
if (AtomicLocation.isInvalid())
AtomicLocation = Loc;
}
/// Returns the location of the first encountered atomic directive in the
/// module.
SourceLocation getAtomicDirectiveLoc() const {
return AtomicLocation;
}
// Return previously encountered target region locations.
ArrayRef<SourceLocation> getEncounteredTargetLocs() const {
return TargetLocations;
}
/// Set default data sharing attribute to none.
void setDefaultDSANone(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_none;
getTopOfStack().DefaultAttrLoc = Loc;
}
/// Set default data sharing attribute to shared.
void setDefaultDSAShared(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_shared;
getTopOfStack().DefaultAttrLoc = Loc;
}
/// Set default data sharing attribute to firstprivate.
void setDefaultDSAFirstPrivate(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_firstprivate;
getTopOfStack().DefaultAttrLoc = Loc;
}
/// Set default data mapping attribute to Modifier:Kind
void setDefaultDMAAttr(OpenMPDefaultmapClauseModifier M,
OpenMPDefaultmapClauseKind Kind,
SourceLocation Loc) {
DefaultmapInfo &DMI = getTopOfStack().DefaultmapMap[Kind];
DMI.ImplicitBehavior = M;
DMI.SLoc = Loc;
}
/// Check whether the implicit-behavior has been set in defaultmap
bool checkDefaultmapCategory(OpenMPDefaultmapClauseKind VariableCategory) {
if (VariableCategory == OMPC_DEFAULTMAP_unknown)
return getTopOfStack()
.DefaultmapMap[OMPC_DEFAULTMAP_aggregate]
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown ||
getTopOfStack()
.DefaultmapMap[OMPC_DEFAULTMAP_scalar]
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown ||
getTopOfStack()
.DefaultmapMap[OMPC_DEFAULTMAP_pointer]
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown;
return getTopOfStack().DefaultmapMap[VariableCategory].ImplicitBehavior !=
OMPC_DEFAULTMAP_MODIFIER_unknown;
}
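// Illustrative defaultmap forms that populate DefaultmapMap (sketch only):
//   #pragma omp target defaultmap(firstprivate: scalar)
//   #pragma omp target defaultmap(tofrom: aggregate) defaultmap(default: pointer)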
DefaultDataSharingAttributes getDefaultDSA(unsigned Level) const {
return getStackSize() <= Level ? DSA_unspecified
: getStackElemAtLevel(Level).DefaultAttr;
}
DefaultDataSharingAttributes getDefaultDSA() const {
return isStackEmpty() ? DSA_unspecified
: getTopOfStack().DefaultAttr;
}
SourceLocation getDefaultDSALocation() const {
return isStackEmpty() ? SourceLocation()
: getTopOfStack().DefaultAttrLoc;
}
OpenMPDefaultmapClauseModifier
getDefaultmapModifier(OpenMPDefaultmapClauseKind Kind) const {
return isStackEmpty()
? OMPC_DEFAULTMAP_MODIFIER_unknown
: getTopOfStack().DefaultmapMap[Kind].ImplicitBehavior;
}
OpenMPDefaultmapClauseModifier
getDefaultmapModifierAtLevel(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
return getStackElemAtLevel(Level).DefaultmapMap[Kind].ImplicitBehavior;
}
bool isDefaultmapCapturedByRef(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M =
getDefaultmapModifierAtLevel(Level, Kind);
if (Kind == OMPC_DEFAULTMAP_scalar || Kind == OMPC_DEFAULTMAP_pointer) {
return (M == OMPC_DEFAULTMAP_MODIFIER_alloc) ||
(M == OMPC_DEFAULTMAP_MODIFIER_to) ||
(M == OMPC_DEFAULTMAP_MODIFIER_from) ||
(M == OMPC_DEFAULTMAP_MODIFIER_tofrom);
}
return true;
}
static bool mustBeFirstprivateBase(OpenMPDefaultmapClauseModifier M,
OpenMPDefaultmapClauseKind Kind) {
switch (Kind) {
case OMPC_DEFAULTMAP_scalar:
case OMPC_DEFAULTMAP_pointer:
return (M == OMPC_DEFAULTMAP_MODIFIER_unknown) ||
(M == OMPC_DEFAULTMAP_MODIFIER_firstprivate) ||
(M == OMPC_DEFAULTMAP_MODIFIER_default);
case OMPC_DEFAULTMAP_aggregate:
return M == OMPC_DEFAULTMAP_MODIFIER_firstprivate;
default:
break;
}
llvm_unreachable("Unexpected OpenMPDefaultmapClauseKind enum");
}
bool mustBeFirstprivateAtLevel(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M =
getDefaultmapModifierAtLevel(Level, Kind);
return mustBeFirstprivateBase(M, Kind);
}
bool mustBeFirstprivate(OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M = getDefaultmapModifier(Kind);
return mustBeFirstprivateBase(M, Kind);
}
/// Checks if the specified variable is threadprivate.
bool isThreadPrivate(VarDecl *D) {
const DSAVarData DVar = getTopDSA(D, false);
return isOpenMPThreadPrivate(DVar.CKind);
}
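// Example of the construct this checks for (sketch):
//   static int Counter;
//   #pragma omp threadprivate(Counter)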
/// Marks current region as ordered (it has an 'ordered' clause).
void setOrderedRegion(bool IsOrdered, const Expr *Param,
OMPOrderedClause *Clause) {
if (IsOrdered)
getTopOfStack().OrderedRegion.emplace(Param, Clause);
else
getTopOfStack().OrderedRegion.reset();
}
/// Returns true if the region is ordered (has an associated 'ordered'
/// clause), false otherwise.
bool isOrderedRegion() const {
if (const SharingMapTy *Top = getTopOfStackOrNull())
return Top->OrderedRegion.hasValue();
return false;
}
/// Returns optional parameter for the ordered region.
std::pair<const Expr *, OMPOrderedClause *> getOrderedRegionParam() const {
if (const SharingMapTy *Top = getTopOfStackOrNull())
if (Top->OrderedRegion.hasValue())
return Top->OrderedRegion.getValue();
return std::make_pair(nullptr, nullptr);
}
/// Returns true if the parent region is ordered (has an associated
/// 'ordered' clause), false otherwise.
bool isParentOrderedRegion() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
return Parent->OrderedRegion.hasValue();
return false;
}
/// Returns optional parameter for the ordered region.
std::pair<const Expr *, OMPOrderedClause *>
getParentOrderedRegionParam() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
if (Parent->OrderedRegion.hasValue())
return Parent->OrderedRegion.getValue();
return std::make_pair(nullptr, nullptr);
}
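// The ordered-region parameter distinguishes, e.g. (illustrative):
//   #pragma omp for ordered      // no parameter: the Expr * is null
//   #pragma omp for ordered(2)   // parameter '2': doacross loop nest depth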
/// Marks current region as nowait (it has a 'nowait' clause).
void setNowaitRegion(bool IsNowait = true) {
getTopOfStack().NowaitRegion = IsNowait;
}
/// Returns true if the parent region is nowait (has an associated
/// 'nowait' clause), false otherwise.
bool isParentNowaitRegion() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
return Parent->NowaitRegion;
return false;
}
/// Marks parent region as cancel region.
void setParentCancelRegion(bool Cancel = true) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->CancelRegion |= Cancel;
}
/// Return true if current region has inner cancel construct.
bool isCancelRegion() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->CancelRegion : false;
}
/// Mark that parent region already has scan directive.
void setParentHasScanDirective(SourceLocation Loc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->PrevScanLocation = Loc;
}
/// Return true if the parent region already has a scan directive.
bool doesParentHasScanDirective() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevScanLocation.isValid() : false;
}
/// Returns the location of the scan directive previously seen in the
/// parent region, if any.
SourceLocation getParentScanDirectiveLoc() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevScanLocation : SourceLocation();
}
/// Mark that parent region already has ordered directive.
void setParentHasOrderedDirective(SourceLocation Loc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->PrevOrderedLocation = Loc;
}
/// Return true if the parent region already has an ordered directive.
bool doesParentHasOrderedDirective() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevOrderedLocation.isValid() : false;
}
/// Returns the location of the previously specified ordered directive.
SourceLocation getParentOrderedDirectiveLoc() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevOrderedLocation : SourceLocation();
}
/// Set collapse value for the region.
void setAssociatedLoops(unsigned Val) {
getTopOfStack().AssociatedLoops = Val;
if (Val > 1)
getTopOfStack().HasMutipleLoops = true;
}
/// Return collapse value for region.
unsigned getAssociatedLoops() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->AssociatedLoops : 0;
}
/// Returns true if the construct is associated with multiple loops.
bool hasMutipleLoops() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->HasMutipleLoops : false;
}
/// Marks current target region as one with closely nested teams
/// region.
void setParentTeamsRegionLoc(SourceLocation TeamsRegionLoc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->InnerTeamsRegionLoc = TeamsRegionLoc;
}
/// Returns true, if current region has closely nested teams region.
bool hasInnerTeamsRegion() const {
return getInnerTeamsRegionLoc().isValid();
}
/// Returns location of the nested teams region (if any).
SourceLocation getInnerTeamsRegionLoc() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->InnerTeamsRegionLoc : SourceLocation();
}
Scope *getCurScope() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->CurScope : nullptr;
}
SourceLocation getConstructLoc() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->ConstructLoc : SourceLocation();
}
/// Do the check specified in \a Check to all component lists and return true
/// if any issue is found.
bool checkMappableExprComponentListsForDecl(
const ValueDecl *VD, bool CurrentRegionOnly,
const llvm::function_ref<
bool(OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind)>
Check) const {
if (isStackEmpty())
return false;
auto SI = begin();
auto SE = end();
if (SI == SE)
return false;
if (CurrentRegionOnly)
SE = std::next(SI);
else
std::advance(SI, 1);
for (; SI != SE; ++SI) {
auto MI = SI->MappedExprComponents.find(VD);
if (MI != SI->MappedExprComponents.end())
for (OMPClauseMappableExprCommon::MappableExprComponentListRef L :
MI->second.Components)
if (Check(L, MI->second.Kind))
return true;
}
return false;
}
/// Do the check specified in \a Check to all component lists at a given level
/// and return true if any issue is found.
bool checkMappableExprComponentListsForDeclAtLevel(
const ValueDecl *VD, unsigned Level,
const llvm::function_ref<
bool(OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind)>
Check) const {
if (getStackSize() <= Level)
return false;
const SharingMapTy &StackElem = getStackElemAtLevel(Level);
auto MI = StackElem.MappedExprComponents.find(VD);
if (MI != StackElem.MappedExprComponents.end())
for (OMPClauseMappableExprCommon::MappableExprComponentListRef L :
MI->second.Components)
if (Check(L, MI->second.Kind))
return true;
return false;
}
/// Create a new mappable expression component list associated with a given
/// declaration and initialize it with the provided list of components.
void addMappableExpressionComponents(
const ValueDecl *VD,
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
OpenMPClauseKind WhereFoundClauseKind) {
MappedExprComponentTy &MEC = getTopOfStack().MappedExprComponents[VD];
// Create new entry and append the new components there.
MEC.Components.resize(MEC.Components.size() + 1);
MEC.Components.back().append(Components.begin(), Components.end());
MEC.Kind = WhereFoundClauseKind;
}
unsigned getNestingLevel() const {
assert(!isStackEmpty());
return getStackSize() - 1;
}
void addDoacrossDependClause(OMPDependClause *C,
const OperatorOffsetTy &OpsOffs) {
SharingMapTy *Parent = getSecondOnStackOrNull();
assert(Parent && isOpenMPWorksharingDirective(Parent->Directive));
Parent->DoacrossDepends.try_emplace(C, OpsOffs);
}
llvm::iterator_range<DoacrossDependMapTy::const_iterator>
getDoacrossDependClauses() const {
const SharingMapTy &StackElem = getTopOfStack();
if (isOpenMPWorksharingDirective(StackElem.Directive)) {
const DoacrossDependMapTy &Ref = StackElem.DoacrossDepends;
return llvm::make_range(Ref.begin(), Ref.end());
}
return llvm::make_range(StackElem.DoacrossDepends.end(),
StackElem.DoacrossDepends.end());
}
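// Doacross dependences recorded here come from source such as (sketch):
//   #pragma omp for ordered(2)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j) {
//       #pragma omp ordered depend(sink: i - 1, j)
//       // ... consume values produced by earlier iterations ...
//       #pragma omp ordered depend(source)
//     }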
// Store types of classes which have been explicitly mapped
void addMappedClassesQualTypes(QualType QT) {
SharingMapTy &StackElem = getTopOfStack();
StackElem.MappedClassesQualTypes.insert(QT);
}
// Return true if the given class type was previously mapped.
bool isClassPreviouslyMapped(QualType QT) const {
const SharingMapTy &StackElem = getTopOfStack();
return StackElem.MappedClassesQualTypes.count(QT) != 0;
}
/// Adds global declare target to the parent target region.
void addToParentTargetRegionLinkGlobals(DeclRefExpr *E) {
assert(*OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(
E->getDecl()) == OMPDeclareTargetDeclAttr::MT_Link &&
"Expected declare target link global.");
for (auto &Elem : *this) {
if (isOpenMPTargetExecutionDirective(Elem.Directive)) {
Elem.DeclareTargetLinkVarDecls.push_back(E);
return;
}
}
}
/// Returns the list of globals with declare target link if current directive
/// is target.
ArrayRef<DeclRefExpr *> getLinkGlobals() const {
assert(isOpenMPTargetExecutionDirective(getCurrentDirective()) &&
"Expected target executable directive.");
return getTopOfStack().DeclareTargetLinkVarDecls;
}
/// Adds the allocator expression to the list of used allocators.
void addInnerAllocatorExpr(Expr *E) {
getTopOfStack().InnerUsedAllocators.push_back(E);
}
/// Return list of used allocators.
ArrayRef<Expr *> getInnerAllocators() const {
return getTopOfStack().InnerUsedAllocators;
}
/// Marks the declaration as implicitly firstprivate in the task-based
/// regions.
void addImplicitTaskFirstprivate(unsigned Level, Decl *D) {
getStackElemAtLevel(Level).ImplicitTaskFirstprivates.insert(D);
}
/// Checks if the decl is implicitly firstprivate in the task-based region.
bool isImplicitTaskFirstprivate(Decl *D) const {
return getTopOfStack().ImplicitTaskFirstprivates.count(D) > 0;
}
/// Marks decl as used in uses_allocators clause as the allocator.
void addUsesAllocatorsDecl(const Decl *D, UsesAllocatorsDeclKind Kind) {
getTopOfStack().UsesAllocatorsDecls.try_emplace(D, Kind);
}
/// Checks if the specified decl is used in the uses_allocators clause as
/// the allocator.
Optional<UsesAllocatorsDeclKind> isUsesAllocatorsDecl(unsigned Level,
const Decl *D) const {
const SharingMapTy &StackElem = getTopOfStack();
auto I = StackElem.UsesAllocatorsDecls.find(D);
if (I == StackElem.UsesAllocatorsDecls.end())
return None;
return I->getSecond();
}
Optional<UsesAllocatorsDeclKind> isUsesAllocatorsDecl(const Decl *D) const {
const SharingMapTy &StackElem = getTopOfStack();
auto I = StackElem.UsesAllocatorsDecls.find(D);
if (I == StackElem.UsesAllocatorsDecls.end())
return None;
return I->getSecond();
}
};
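// Rough correspondence between this stack and nested directives (sketch):
//   #pragma omp target       // push(OMPD_target, ...)
//   #pragma omp teams        //   push(OMPD_teams, ...)
//   #pragma omp distribute   //     push(OMPD_distribute, ...)
// with a matching pop() as semantic analysis of each region completes.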
bool isImplicitTaskingRegion(OpenMPDirectiveKind DKind) {
return isOpenMPParallelDirective(DKind) || isOpenMPTeamsDirective(DKind);
}
bool isImplicitOrExplicitTaskingRegion(OpenMPDirectiveKind DKind) {
return isImplicitTaskingRegion(DKind) || isOpenMPTaskingDirective(DKind) ||
DKind == OMPD_unknown;
}
} // namespace
static const Expr *getExprAsWritten(const Expr *E) {
if (const auto *FE = dyn_cast<FullExpr>(E))
E = FE->getSubExpr();
if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
E = MTE->getSubExpr();
while (const auto *Binder = dyn_cast<CXXBindTemporaryExpr>(E))
E = Binder->getSubExpr();
if (const auto *ICE = dyn_cast<ImplicitCastExpr>(E))
E = ICE->getSubExprAsWritten();
return E->IgnoreParens();
}
static Expr *getExprAsWritten(Expr *E) {
return const_cast<Expr *>(getExprAsWritten(const_cast<const Expr *>(E)));
}
static const ValueDecl *getCanonicalDecl(const ValueDecl *D) {
if (const auto *CED = dyn_cast<OMPCapturedExprDecl>(D))
if (const auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
D = ME->getMemberDecl();
const auto *VD = dyn_cast<VarDecl>(D);
const auto *FD = dyn_cast<FieldDecl>(D);
if (VD != nullptr) {
VD = VD->getCanonicalDecl();
D = VD;
} else {
assert(FD);
FD = FD->getCanonicalDecl();
D = FD;
}
return D;
}
static ValueDecl *getCanonicalDecl(ValueDecl *D) {
return const_cast<ValueDecl *>(
getCanonicalDecl(const_cast<const ValueDecl *>(D)));
}
DSAStackTy::DSAVarData DSAStackTy::getDSA(const_iterator &Iter,
ValueDecl *D) const {
D = getCanonicalDecl(D);
auto *VD = dyn_cast<VarDecl>(D);
const auto *FD = dyn_cast<FieldDecl>(D);
DSAVarData DVar;
if (Iter == end()) {
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a region but not in construct]
// File-scope or namespace-scope variables referenced in called routines
// in the region are shared unless they appear in a threadprivate
// directive.
if (VD && !VD->isFunctionOrMethodVarDecl() && !isa<ParmVarDecl>(VD))
DVar.CKind = OMPC_shared;
// OpenMP [2.9.1.2, Data-sharing Attribute Rules for Variables Referenced
// in a region but not in construct]
// Variables with static storage duration that are declared in called
// routines in the region are shared.
if (VD && VD->hasGlobalStorage())
DVar.CKind = OMPC_shared;
// Non-static data members are shared by default.
if (FD)
DVar.CKind = OMPC_shared;
return DVar;
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.1]
// Variables with automatic storage duration that are declared in a scope
// inside the construct are private.
if (VD && isOpenMPLocal(VD, Iter) && VD->isLocalVarDecl() &&
(VD->getStorageClass() == SC_Auto || VD->getStorageClass() == SC_None)) {
DVar.CKind = OMPC_private;
return DVar;
}
DVar.DKind = Iter->Directive;
// Explicitly specified attributes and local variables with predetermined
// attributes.
if (Iter->SharingMap.count(D)) {
const DSAInfo &Data = Iter->SharingMap.lookup(D);
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
DVar.Modifier = Data.Modifier;
return DVar;
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, implicitly determined, p.1]
// In a parallel or task construct, the data-sharing attributes of these
// variables are determined by the default clause, if present.
switch (Iter->DefaultAttr) {
case DSA_shared:
DVar.CKind = OMPC_shared;
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
return DVar;
case DSA_none:
return DVar;
case DSA_firstprivate:
if (VD->getStorageDuration() == SD_Static &&
VD->getDeclContext()->isFileContext()) {
DVar.CKind = OMPC_unknown;
} else {
DVar.CKind = OMPC_firstprivate;
}
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
return DVar;
case DSA_unspecified:
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.2]
// In a parallel construct, if no default clause is present, these
// variables are shared.
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
if ((isOpenMPParallelDirective(DVar.DKind) &&
!isOpenMPTaskLoopDirective(DVar.DKind)) ||
isOpenMPTeamsDirective(DVar.DKind)) {
DVar.CKind = OMPC_shared;
return DVar;
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.4]
// In a task construct, if no default clause is present, a variable that in
// the enclosing context is determined to be shared by all implicit tasks
// bound to the current team is shared.
if (isOpenMPTaskingDirective(DVar.DKind)) {
DSAVarData DVarTemp;
const_iterator I = Iter, E = end();
do {
++I;
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables
// Referenced in a Construct, implicitly determined, p.6]
// In a task construct, if no default clause is present, a variable
// whose data-sharing attribute is not determined by the rules above is
// firstprivate.
DVarTemp = getDSA(I, D);
if (DVarTemp.CKind != OMPC_shared) {
DVar.RefExpr = nullptr;
DVar.CKind = OMPC_firstprivate;
return DVar;
}
} while (I != E && !isImplicitTaskingRegion(I->Directive));
DVar.CKind =
(DVarTemp.CKind == OMPC_unknown) ? OMPC_firstprivate : OMPC_shared;
return DVar;
}
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.3]
// For constructs other than task, if no default clause is present, these
// variables inherit their data-sharing attributes from the enclosing
// context.
return getDSA(++Iter, D);
}
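// Worked example of the implicit rules above (illustrative only; 'use' is a
// placeholder for any reference to the variables):
//   int G;                   // namespace scope
//   void f() {
//     #pragma omp parallel   // no default clause: G is implicitly shared
//     {
//       int L = 0;           // declared inside the construct: private
//       #pragma omp task     // G is shared by the enclosing implicit tasks,
//       {                    // so it stays shared in the task; L is not
//         use(G, L);         // shared there, so it becomes firstprivate
//       }
//     }
//   }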
const Expr *DSAStackTy::addUniqueAligned(const ValueDecl *D,
const Expr *NewDE) {
assert(!isStackEmpty() && "Data sharing attributes stack is empty");
D = getCanonicalDecl(D);
SharingMapTy &StackElem = getTopOfStack();
auto It = StackElem.AlignedMap.find(D);
if (It == StackElem.AlignedMap.end()) {
assert(NewDE && "Unexpected nullptr expr to be added into aligned map");
StackElem.AlignedMap[D] = NewDE;
return nullptr;
}
assert(It->second && "Unexpected nullptr expr in the aligned map");
return It->second;
}
const Expr *DSAStackTy::addUniqueNontemporal(const ValueDecl *D,
const Expr *NewDE) {
assert(!isStackEmpty() && "Data sharing attributes stack is empty");
D = getCanonicalDecl(D);
SharingMapTy &StackElem = getTopOfStack();
auto It = StackElem.NontemporalMap.find(D);
if (It == StackElem.NontemporalMap.end()) {
assert(NewDE && "Unexpected nullptr expr to be added into aligned map");
StackElem.NontemporalMap[D] = NewDE;
return nullptr;
}
assert(It->second && "Unexpected nullptr expr in the aligned map");
return It->second;
}
void DSAStackTy::addLoopControlVariable(const ValueDecl *D, VarDecl *Capture) {
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
D = getCanonicalDecl(D);
SharingMapTy &StackElem = getTopOfStack();
StackElem.LCVMap.try_emplace(
D, LCDeclInfo(StackElem.LCVMap.size() + 1, Capture));
}
const DSAStackTy::LCDeclInfo
DSAStackTy::isLoopControlVariable(const ValueDecl *D) const {
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
D = getCanonicalDecl(D);
const SharingMapTy &StackElem = getTopOfStack();
auto It = StackElem.LCVMap.find(D);
if (It != StackElem.LCVMap.end())
return It->second;
return {0, nullptr};
}
const DSAStackTy::LCDeclInfo
DSAStackTy::isLoopControlVariable(const ValueDecl *D, unsigned Level) const {
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
D = getCanonicalDecl(D);
for (unsigned I = Level + 1; I > 0; --I) {
const SharingMapTy &StackElem = getStackElemAtLevel(I - 1);
auto It = StackElem.LCVMap.find(D);
if (It != StackElem.LCVMap.end())
return It->second;
}
return {0, nullptr};
}
const DSAStackTy::LCDeclInfo
DSAStackTy::isParentLoopControlVariable(const ValueDecl *D) const {
const SharingMapTy *Parent = getSecondOnStackOrNull();
assert(Parent && "Data-sharing attributes stack is empty");
D = getCanonicalDecl(D);
auto It = Parent->LCVMap.find(D);
if (It != Parent->LCVMap.end())
return It->second;
return {0, nullptr};
}
const ValueDecl *DSAStackTy::getParentLoopControlVariable(unsigned I) const {
const SharingMapTy *Parent = getSecondOnStackOrNull();
assert(Parent && "Data-sharing attributes stack is empty");
if (Parent->LCVMap.size() < I)
return nullptr;
for (const auto &Pair : Parent->LCVMap)
if (Pair.second.first == I)
return Pair.first;
return nullptr;
}
void DSAStackTy::addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A,
DeclRefExpr *PrivateCopy, unsigned Modifier) {
D = getCanonicalDecl(D);
if (A == OMPC_threadprivate) {
DSAInfo &Data = Threadprivates[D];
Data.Attributes = A;
Data.RefExpr.setPointer(E);
Data.PrivateCopy = nullptr;
Data.Modifier = Modifier;
} else {
DSAInfo &Data = getTopOfStack().SharingMap[D];
assert(Data.Attributes == OMPC_unknown || (A == Data.Attributes) ||
(A == OMPC_firstprivate && Data.Attributes == OMPC_lastprivate) ||
(A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) ||
(isLoopControlVariable(D).first && A == OMPC_private));
Data.Modifier = Modifier;
if (A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) {
Data.RefExpr.setInt(/*IntVal=*/true);
return;
}
const bool IsLastprivate =
A == OMPC_lastprivate || Data.Attributes == OMPC_lastprivate;
Data.Attributes = A;
Data.RefExpr.setPointerAndInt(E, IsLastprivate);
Data.PrivateCopy = PrivateCopy;
if (PrivateCopy) {
DSAInfo &Data = getTopOfStack().SharingMap[PrivateCopy->getDecl()];
Data.Modifier = Modifier;
Data.Attributes = A;
Data.RefExpr.setPointerAndInt(PrivateCopy, IsLastprivate);
Data.PrivateCopy = nullptr;
}
}
}
/// Build a variable declaration for OpenMP loop iteration variable.
static VarDecl *buildVarDecl(Sema &SemaRef, SourceLocation Loc, QualType Type,
StringRef Name, const AttrVec *Attrs = nullptr,
DeclRefExpr *OrigRef = nullptr) {
DeclContext *DC = SemaRef.CurContext;
IdentifierInfo *II = &SemaRef.PP.getIdentifierTable().get(Name);
TypeSourceInfo *TInfo = SemaRef.Context.getTrivialTypeSourceInfo(Type, Loc);
auto *Decl =
VarDecl::Create(SemaRef.Context, DC, Loc, Loc, II, Type, TInfo, SC_None);
if (Attrs) {
for (specific_attr_iterator<AlignedAttr> I(Attrs->begin()), E(Attrs->end());
I != E; ++I)
Decl->addAttr(*I);
}
Decl->setImplicit();
if (OrigRef) {
Decl->addAttr(
OMPReferencedVarAttr::CreateImplicit(SemaRef.Context, OrigRef));
}
return Decl;
}
static DeclRefExpr *buildDeclRefExpr(Sema &S, VarDecl *D, QualType Ty,
SourceLocation Loc,
bool RefersToCapture = false) {
D->setReferenced();
D->markUsed(S.Context);
return DeclRefExpr::Create(S.getASTContext(), NestedNameSpecifierLoc(),
SourceLocation(), D, RefersToCapture, Loc, Ty,
VK_LValue);
}
void DSAStackTy::addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
BinaryOperatorKind BOK) {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
assert(
getTopOfStack().SharingMap[D].Attributes == OMPC_reduction &&
"Additional reduction info may be specified only for reduction items.");
ReductionData &ReductionData = getTopOfStack().ReductionMap[D];
assert(ReductionData.ReductionRange.isInvalid() &&
(getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"Additional reduction info may be specified only once for reduction "
"items.");
ReductionData.set(BOK, SR);
Expr *&TaskgroupReductionRef =
getTopOfStack().TaskgroupReductionRef;
if (!TaskgroupReductionRef) {
VarDecl *VD = buildVarDecl(SemaRef, SR.getBegin(),
SemaRef.Context.VoidPtrTy, ".task_red.");
TaskgroupReductionRef =
buildDeclRefExpr(SemaRef, VD, SemaRef.Context.VoidPtrTy, SR.getBegin());
}
}
void DSAStackTy::addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
const Expr *ReductionRef) {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
assert(
getTopOfStack().SharingMap[D].Attributes == OMPC_reduction &&
"Additional reduction info may be specified only for reduction items.");
ReductionData &ReductionData = getTopOfStack().ReductionMap[D];
assert(ReductionData.ReductionRange.isInvalid() &&
(getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"Additional reduction info may be specified only once for reduction "
"items.");
ReductionData.set(ReductionRef, SR);
Expr *&TaskgroupReductionRef =
getTopOfStack().TaskgroupReductionRef;
if (!TaskgroupReductionRef) {
VarDecl *VD = buildVarDecl(SemaRef, SR.getBegin(),
SemaRef.Context.VoidPtrTy, ".task_red.");
TaskgroupReductionRef =
buildDeclRefExpr(SemaRef, VD, SemaRef.Context.VoidPtrTy, SR.getBegin());
}
}
const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData(
const ValueDecl *D, SourceRange &SR, BinaryOperatorKind &BOK,
Expr *&TaskgroupDescriptor) const {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty.");
for (const_iterator I = begin() + 1, E = end(); I != E; ++I) {
const DSAInfo &Data = I->SharingMap.lookup(D);
if (Data.Attributes != OMPC_reduction ||
Data.Modifier != OMPC_REDUCTION_task)
continue;
const ReductionData &ReductionData = I->ReductionMap.lookup(D);
if (!ReductionData.ReductionOp ||
ReductionData.ReductionOp.is<const Expr *>())
return DSAVarData();
SR = ReductionData.ReductionRange;
BOK = ReductionData.ReductionOp.get<ReductionData::BOKPtrType>();
assert(I->TaskgroupReductionRef && "taskgroup reduction reference "
"expression for the descriptor is not "
"set.");
TaskgroupDescriptor = I->TaskgroupReductionRef;
return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(),
Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task);
}
return DSAVarData();
}
const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData(
const ValueDecl *D, SourceRange &SR, const Expr *&ReductionRef,
Expr *&TaskgroupDescriptor) const {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty.");
for (const_iterator I = begin() + 1, E = end(); I != E; ++I) {
const DSAInfo &Data = I->SharingMap.lookup(D);
if (Data.Attributes != OMPC_reduction ||
Data.Modifier != OMPC_REDUCTION_task)
continue;
const ReductionData &ReductionData = I->ReductionMap.lookup(D);
if (!ReductionData.ReductionOp ||
!ReductionData.ReductionOp.is<const Expr *>())
return DSAVarData();
SR = ReductionData.ReductionRange;
ReductionRef = ReductionData.ReductionOp.get<const Expr *>();
assert(I->TaskgroupReductionRef && "taskgroup reduction reference "
"expression for the descriptor is not "
"set.");
TaskgroupDescriptor = I->TaskgroupReductionRef;
return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(),
Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task);
}
return DSAVarData();
}
bool DSAStackTy::isOpenMPLocal(VarDecl *D, const_iterator I) const {
D = D->getCanonicalDecl();
for (const_iterator E = end(); I != E; ++I) {
if (isImplicitOrExplicitTaskingRegion(I->Directive) ||
isOpenMPTargetExecutionDirective(I->Directive)) {
Scope *TopScope = I->CurScope ? I->CurScope->getParent() : nullptr;
Scope *CurScope = getCurScope();
while (CurScope && CurScope != TopScope && !CurScope->isDeclScope(D))
CurScope = CurScope->getParent();
return CurScope != TopScope;
}
}
return false;
}
static bool isConstNotMutableType(Sema &SemaRef, QualType Type,
bool AcceptIfMutable = true,
bool *IsClassType = nullptr) {
ASTContext &Context = SemaRef.getASTContext();
Type = Type.getNonReferenceType().getCanonicalType();
bool IsConstant = Type.isConstant(Context);
Type = Context.getBaseElementType(Type);
const CXXRecordDecl *RD = AcceptIfMutable && SemaRef.getLangOpts().CPlusPlus
? Type->getAsCXXRecordDecl()
: nullptr;
if (const auto *CTSD = dyn_cast_or_null<ClassTemplateSpecializationDecl>(RD))
if (const ClassTemplateDecl *CTD = CTSD->getSpecializedTemplate())
RD = CTD->getTemplatedDecl();
if (IsClassType)
*IsClassType = RD;
return IsConstant && !(SemaRef.getLangOpts().CPlusPlus && RD &&
RD->hasDefinition() && RD->hasMutableFields());
}
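// Example of the distinction drawn here (illustrative):
//   struct S { mutable int Cache; };
//   const S Obj;        // const, but has a mutable field -> not "const not mutable"
//   const int Val = 0;  // const with no mutable fields -> "const not mutable"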
static bool rejectConstNotMutableType(Sema &SemaRef, const ValueDecl *D,
QualType Type, OpenMPClauseKind CKind,
SourceLocation ELoc,
bool AcceptIfMutable = true,
bool ListItemNotVar = false) {
ASTContext &Context = SemaRef.getASTContext();
bool IsClassType;
if (isConstNotMutableType(SemaRef, Type, AcceptIfMutable, &IsClassType)) {
unsigned Diag = ListItemNotVar
? diag::err_omp_const_list_item
: IsClassType ? diag::err_omp_const_not_mutable_variable
: diag::err_omp_const_variable;
SemaRef.Diag(ELoc, Diag) << getOpenMPClauseName(CKind);
if (!ListItemNotVar && D) {
const VarDecl *VD = dyn_cast<VarDecl>(D);
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
SemaRef.Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
}
return true;
}
return false;
}
const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D,
bool FromParent) {
D = getCanonicalDecl(D);
DSAVarData DVar;
auto *VD = dyn_cast<VarDecl>(D);
auto TI = Threadprivates.find(D);
if (TI != Threadprivates.end()) {
DVar.RefExpr = TI->getSecond().RefExpr.getPointer();
DVar.CKind = OMPC_threadprivate;
DVar.Modifier = TI->getSecond().Modifier;
return DVar;
}
if (VD && VD->hasAttr<OMPThreadPrivateDeclAttr>()) {
DVar.RefExpr = buildDeclRefExpr(
SemaRef, VD, D->getType().getNonReferenceType(),
VD->getAttr<OMPThreadPrivateDeclAttr>()->getLocation());
DVar.CKind = OMPC_threadprivate;
addDSA(D, DVar.RefExpr, OMPC_threadprivate);
return DVar;
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.1]
// Variables appearing in threadprivate directives are threadprivate.
if ((VD && VD->getTLSKind() != VarDecl::TLS_None &&
!(VD->hasAttr<OMPThreadPrivateDeclAttr>() &&
SemaRef.getLangOpts().OpenMPUseTLS &&
SemaRef.getASTContext().getTargetInfo().isTLSSupported())) ||
(VD && VD->getStorageClass() == SC_Register &&
VD->hasAttr<AsmLabelAttr>() && !VD->isLocalVarDecl())) {
DVar.RefExpr = buildDeclRefExpr(
SemaRef, VD, D->getType().getNonReferenceType(), D->getLocation());
DVar.CKind = OMPC_threadprivate;
addDSA(D, DVar.RefExpr, OMPC_threadprivate);
return DVar;
}
if (SemaRef.getLangOpts().OpenMPCUDAMode && VD &&
VD->isLocalVarDeclOrParm() && !isStackEmpty() &&
!isLoopControlVariable(D).first) {
const_iterator IterTarget =
std::find_if(begin(), end(), [](const SharingMapTy &Data) {
return isOpenMPTargetExecutionDirective(Data.Directive);
});
if (IterTarget != end()) {
const_iterator ParentIterTarget = IterTarget + 1;
for (const_iterator Iter = begin();
Iter != ParentIterTarget; ++Iter) {
if (isOpenMPLocal(VD, Iter)) {
DVar.RefExpr =
buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(),
D->getLocation());
DVar.CKind = OMPC_threadprivate;
return DVar;
}
}
if (!isClauseParsingMode() || IterTarget != begin()) {
auto DSAIter = IterTarget->SharingMap.find(D);
if (DSAIter != IterTarget->SharingMap.end() &&
isOpenMPPrivate(DSAIter->getSecond().Attributes)) {
DVar.RefExpr = DSAIter->getSecond().RefExpr.getPointer();
DVar.CKind = OMPC_threadprivate;
return DVar;
}
const_iterator End = end();
if (!SemaRef.isOpenMPCapturedByRef(
D, std::distance(ParentIterTarget, End),
/*OpenMPCaptureLevel=*/0)) {
DVar.RefExpr =
buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(),
IterTarget->ConstructLoc);
DVar.CKind = OMPC_threadprivate;
return DVar;
}
}
}
}
if (isStackEmpty())
// Not in OpenMP execution region and top scope was already checked.
return DVar;
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.4]
// Static data members are shared.
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.7]
// Variables with static storage duration that are declared in a scope
// inside the construct are shared.
if (VD && VD->isStaticDataMember()) {
// Check for explicitly specified attributes.
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
++I;
if (I != EndI) {
auto It = I->SharingMap.find(D);
if (It != I->SharingMap.end()) {
const DSAInfo &Data = It->getSecond();
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = I->DefaultAttrLoc;
DVar.DKind = I->Directive;
DVar.Modifier = Data.Modifier;
return DVar;
}
}
DVar.CKind = OMPC_shared;
return DVar;
}
auto &&MatchesAlways = [](OpenMPDirectiveKind) { return true; };
// The predetermined shared attribute for const-qualified types having no
// mutable members was removed after OpenMP 3.1.
if (SemaRef.LangOpts.OpenMP <= 31) {
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.6]
// Variables with const qualified type having no mutable member are
// shared.
if (isConstNotMutableType(SemaRef, D->getType())) {
// Variables with const-qualified type having no mutable member may be
// listed in a firstprivate clause, even if they are static data members.
DSAVarData DVarTemp = hasInnermostDSA(
D,
[](OpenMPClauseKind C) {
return C == OMPC_firstprivate || C == OMPC_shared;
},
MatchesAlways, FromParent);
if (DVarTemp.CKind != OMPC_unknown && DVarTemp.RefExpr)
return DVarTemp;
DVar.CKind = OMPC_shared;
return DVar;
}
}
// Explicitly specified attributes and local variables with predetermined
// attributes.
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
++I;
if (I == EndI)
return DVar;
auto It = I->SharingMap.find(D);
if (It != I->SharingMap.end()) {
const DSAInfo &Data = It->getSecond();
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = I->DefaultAttrLoc;
DVar.DKind = I->Directive;
DVar.Modifier = Data.Modifier;
}
return DVar;
}
const DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D,
bool FromParent) const {
if (isStackEmpty()) {
const_iterator I;
return getDSA(I, D);
}
D = getCanonicalDecl(D);
const_iterator StartI = begin();
const_iterator EndI = end();
if (FromParent && StartI != EndI)
++StartI;
return getDSA(StartI, D);
}
const DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D,
unsigned Level) const {
if (getStackSize() <= Level)
return DSAVarData();
D = getCanonicalDecl(D);
const_iterator StartI = std::next(begin(), getStackSize() - 1 - Level);
return getDSA(StartI, D);
}
const DSAStackTy::DSAVarData
DSAStackTy::hasDSA(ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const {
if (isStackEmpty())
return {};
D = getCanonicalDecl(D);
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
++I;
for (; I != EndI; ++I) {
if (!DPred(I->Directive) &&
!isImplicitOrExplicitTaskingRegion(I->Directive))
continue;
const_iterator NewI = I;
DSAVarData DVar = getDSA(NewI, D);
if (I == NewI && CPred(DVar.CKind))
return DVar;
}
return {};
}
const DSAStackTy::DSAVarData DSAStackTy::hasInnermostDSA(
ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const {
if (isStackEmpty())
return {};
D = getCanonicalDecl(D);
const_iterator StartI = begin();
const_iterator EndI = end();
if (FromParent && StartI != EndI)
++StartI;
if (StartI == EndI || !DPred(StartI->Directive))
return {};
const_iterator NewI = StartI;
DSAVarData DVar = getDSA(NewI, D);
return (NewI == StartI && CPred(DVar.CKind)) ? DVar : DSAVarData();
}
bool DSAStackTy::hasExplicitDSA(
const ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
unsigned Level, bool NotLastprivate) const {
if (getStackSize() <= Level)
return false;
D = getCanonicalDecl(D);
const SharingMapTy &StackElem = getStackElemAtLevel(Level);
auto I = StackElem.SharingMap.find(D);
if (I != StackElem.SharingMap.end() &&
I->getSecond().RefExpr.getPointer() &&
CPred(I->getSecond().Attributes) &&
(!NotLastprivate || !I->getSecond().RefExpr.getInt()))
return true;
// Check predetermined rules for the loop control variables.
auto LI = StackElem.LCVMap.find(D);
if (LI != StackElem.LCVMap.end())
return CPred(OMPC_private);
return false;
}
bool DSAStackTy::hasExplicitDirective(
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
unsigned Level) const {
if (getStackSize() <= Level)
return false;
const SharingMapTy &StackElem = getStackElemAtLevel(Level);
return DPred(StackElem.Directive);
}
bool DSAStackTy::hasDirective(
const llvm::function_ref<bool(OpenMPDirectiveKind,
const DeclarationNameInfo &, SourceLocation)>
DPred,
bool FromParent) const {
// We look only in the enclosing region.
size_t Skip = FromParent ? 2 : 1;
for (const_iterator I = begin() + std::min(Skip, getStackSize()), E = end();
I != E; ++I) {
if (DPred(I->Directive, I->DirectiveName, I->ConstructLoc))
return true;
}
return false;
}
void Sema::InitDataSharingAttributesStack() {
VarDataSharingAttributesStack = new DSAStackTy(*this);
}
#define DSAStack static_cast<DSAStackTy *>(VarDataSharingAttributesStack)
void Sema::pushOpenMPFunctionRegion() {
DSAStack->pushFunction();
}
void Sema::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) {
DSAStack->popFunction(OldFSI);
}
static bool isOpenMPDeviceDelayedContext(Sema &S) {
assert(S.LangOpts.OpenMP && S.LangOpts.OpenMPIsDevice &&
"Expected OpenMP device compilation.");
return !S.isInOpenMPTargetExecutionDirective() &&
!S.isInOpenMPDeclareTargetContext();
}
namespace {
/// Status of the function emission on the host/device.
enum class FunctionEmissionStatus {
Emitted,
Discarded,
Unknown,
};
} // anonymous namespace
Sema::DeviceDiagBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
unsigned DiagID) {
assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
"Expected OpenMP device compilation.");
FunctionDecl *FD = getCurFunctionDecl();
DeviceDiagBuilder::Kind Kind = DeviceDiagBuilder::K_Nop;
if (FD) {
FunctionEmissionStatus FES = getEmissionStatus(FD);
switch (FES) {
case FunctionEmissionStatus::Emitted:
Kind = DeviceDiagBuilder::K_Immediate;
break;
case FunctionEmissionStatus::Unknown:
Kind = isOpenMPDeviceDelayedContext(*this)
? DeviceDiagBuilder::K_Deferred
: DeviceDiagBuilder::K_Immediate;
break;
case FunctionEmissionStatus::TemplateDiscarded:
case FunctionEmissionStatus::OMPDiscarded:
Kind = DeviceDiagBuilder::K_Nop;
break;
case FunctionEmissionStatus::CUDADiscarded:
llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation");
break;
}
}
return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
}
Sema::DeviceDiagBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
unsigned DiagID) {
assert(LangOpts.OpenMP && !LangOpts.OpenMPIsDevice &&
"Expected OpenMP host compilation.");
FunctionEmissionStatus FES = getEmissionStatus(getCurFunctionDecl());
DeviceDiagBuilder::Kind Kind = DeviceDiagBuilder::K_Nop;
switch (FES) {
case FunctionEmissionStatus::Emitted:
Kind = DeviceDiagBuilder::K_Immediate;
break;
case FunctionEmissionStatus::Unknown:
Kind = DeviceDiagBuilder::K_Deferred;
break;
case FunctionEmissionStatus::TemplateDiscarded:
case FunctionEmissionStatus::OMPDiscarded:
case FunctionEmissionStatus::CUDADiscarded:
Kind = DeviceDiagBuilder::K_Nop;
break;
}
return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
}
static OpenMPDefaultmapClauseKind
getVariableCategoryFromDecl(const LangOptions &LO, const ValueDecl *VD) {
if (LO.OpenMP <= 45) {
if (VD->getType().getNonReferenceType()->isScalarType())
return OMPC_DEFAULTMAP_scalar;
return OMPC_DEFAULTMAP_aggregate;
}
if (VD->getType().getNonReferenceType()->isAnyPointerType())
return OMPC_DEFAULTMAP_pointer;
if (VD->getType().getNonReferenceType()->isScalarType())
return OMPC_DEFAULTMAP_scalar;
return OMPC_DEFAULTMAP_aggregate;
}
bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
unsigned OpenMPCaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
ASTContext &Ctx = getASTContext();
bool IsByRef = true;
// Find the directive that is associated with the provided scope.
D = cast<ValueDecl>(D->getCanonicalDecl());
QualType Ty = D->getType();
bool IsVariableUsedInMapClause = false;
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, Level)) {
// This table summarizes how a given variable should be passed to the device
// given its type and the clauses where it appears. This table is based on
// the description in OpenMP 4.5 [2.10.4, target Construct] and
// OpenMP 4.5 [2.15.5, Data-mapping Attribute Rules and Clauses].
//
// =========================================================================
// | type |  defaultmap   | pvt | first | is_device_ptr |    map   | res.  |
// |      |(tofrom:scalar)|     |  pvt  |               |          |       |
// =========================================================================
// | scl  |               |     |       |       -       |          | bycopy|
// | scl  |               |  -  |   x   |       -       |     -    | bycopy|
// | scl  |               |  x  |   -   |       -       |     -    | null  |
// | scl  |       x       |     |       |       -       |          | byref |
// | scl  |       x       |  -  |   x   |       -       |     -    | bycopy|
// | scl  |       x       |  x  |   -   |       -       |     -    | null  |
// | scl  |               |  -  |   -   |       -       |     x    | byref |
// | scl  |       x       |  -  |   -   |       -       |     x    | byref |
//
// | agg  |      n.a.     |     |       |       -       |          | byref |
// | agg  |      n.a.     |  -  |   x   |       -       |     -    | byref |
// | agg  |      n.a.     |  x  |   -   |       -       |     -    | null  |
// | agg  |      n.a.     |  -  |   -   |       -       |     x    | byref |
// | agg  |      n.a.     |  -  |   -   |       -       |    x[]   | byref |
//
// | ptr  |      n.a.     |     |       |       -       |          | bycopy|
// | ptr  |      n.a.     |  -  |   x   |       -       |     -    | bycopy|
// | ptr  |      n.a.     |  x  |   -   |       -       |     -    | null  |
// | ptr  |      n.a.     |  -  |   -   |       -       |     x    | byref |
// | ptr  |      n.a.     |  -  |   -   |       -       |    x[]   | bycopy|
// | ptr  |      n.a.     |  -  |   -   |       x       |          | bycopy|
// | ptr  |      n.a.     |  -  |   -   |       x       |     x    | bycopy|
// | ptr  |      n.a.     |  -  |   -   |       x       |    x[]   | bycopy|
// =========================================================================
// Legend:
// scl - scalar
// ptr - pointer
// agg - aggregate
// x - applies
// - - invalid in this combination
// [] - mapped with an array section
// byref - should be mapped by reference
// bycopy - should be mapped by copy (passed by value)
// null - initialize a local variable to null on the device
//
// Observations:
// - All scalar declarations that show up in a map clause have to be passed
// by reference, because they may have been mapped in the enclosing data
// environment.
// - If the scalar value does not fit the size of uintptr, it has to be
// passed by reference, regardless of the result in the table above.
// - For pointers mapped by value that have either an implicit map or an
// array section, the runtime library may pass the NULL value to the
// device instead of the value passed to it by the compiler.
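//
// Hedged illustration of the table above (not from the original source; the
// variable name is arbitrary):
//   int s = 0;
//   #pragma omp target          // scalar, implicit firstprivate -> bycopy
//   { s += 1; }
//   #pragma omp target map(s)   // scalar appearing in a map clause -> byref
//   { s += 1; }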
if (Ty->isReferenceType())
Ty = Ty->castAs<ReferenceType>()->getPointeeType();
// Locate map clauses and see if the variable being captured is referred to
// in any of those clauses. Here we only care about variables, not fields,
// because fields are part of aggregates.
bool IsVariableAssociatedWithSection = false;
DSAStack->checkMappableExprComponentListsForDeclAtLevel(
D, Level,
[&IsVariableUsedInMapClause, &IsVariableAssociatedWithSection, D](
OMPClauseMappableExprCommon::MappableExprComponentListRef
MapExprComponents,
OpenMPClauseKind WhereFoundClauseKind) {
// Only the map clause information influences how a variable is
// captured. E.g. is_device_ptr does not require changing the default
// behavior.
if (WhereFoundClauseKind != OMPC_map)
return false;
auto EI = MapExprComponents.rbegin();
auto EE = MapExprComponents.rend();
assert(EI != EE && "Invalid map expression!");
if (isa<DeclRefExpr>(EI->getAssociatedExpression()))
IsVariableUsedInMapClause |= EI->getAssociatedDeclaration() == D;
++EI;
if (EI == EE)
return false;
if (isa<ArraySubscriptExpr>(EI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(EI->getAssociatedExpression()) ||
isa<MemberExpr>(EI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(EI->getAssociatedExpression())) {
IsVariableAssociatedWithSection = true;
// There is nothing more we need to know about this variable.
return true;
}
// Keep looking for more map info.
return false;
});
if (IsVariableUsedInMapClause) {
// If the variable is identified in a map clause, it is always captured by
// reference, except if it is a pointer that is dereferenced somehow.
IsByRef = !(Ty->isPointerType() && IsVariableAssociatedWithSection);
} else {
// By default, all the data that has a scalar type is mapped by copy
// (except for reduction variables).
// Defaultmap scalar is mutually exclusive with defaultmap pointer.
IsByRef =
(DSAStack->isForceCaptureByReferenceInTargetExecutable() &&
!Ty->isAnyPointerType()) ||
!Ty->isScalarType() ||
DSAStack->isDefaultmapCapturedByRef(
Level, getVariableCategoryFromDecl(LangOpts, D)) ||
DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_reduction; }, Level);
}
}
if (IsByRef && Ty.getNonReferenceType()->isScalarType()) {
IsByRef =
((IsVariableUsedInMapClause &&
DSAStack->getCaptureRegion(Level, OpenMPCaptureLevel) ==
OMPD_target) ||
!(DSAStack->hasExplicitDSA(
D,
[](OpenMPClauseKind K) -> bool {
return K == OMPC_firstprivate;
},
Level, /*NotLastprivate=*/true) ||
DSAStack->isUsesAllocatorsDecl(Level, D))) &&
// If the variable is artificial and must be captured by value - try to
// capture by value.
!(isa<OMPCapturedExprDecl>(D) && !D->hasAttr<OMPCaptureNoInitAttr>() &&
!cast<OMPCapturedExprDecl>(D)->getInit()->isGLValue()) &&
// If the variable is implicitly firstprivate and scalar - capture by
// copy
!(DSAStack->getDefaultDSA() == DSA_firstprivate &&
!DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K != OMPC_unknown; }, Level) &&
!DSAStack->isLoopControlVariable(D, Level).first);
}
// When passing data by copy, we need to make sure it fits the uintptr size
// and alignment, because the runtime library only deals with uintptr types.
// If it does not fit the uintptr size, we need to pass the data by reference
// instead.
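// Hedged example (not from the original source; assumes a 64-bit target where
// 'long double' is wider than 'uintptr_t'):
//   long double d = 0;
//   #pragma omp target firstprivate(d) // scalar wider than uintptr -> byref
//   { d += 1; }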
if (!IsByRef &&
(Ctx.getTypeSizeInChars(Ty) >
Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) ||
Ctx.getDeclAlign(D) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType()))) {
IsByRef = true;
}
return IsByRef;
}
unsigned Sema::getOpenMPNestingLevel() const {
assert(getLangOpts().OpenMP);
return DSAStack->getNestingLevel();
}
bool Sema::isInOpenMPTargetExecutionDirective() const {
return (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) &&
!DSAStack->isClauseParsingMode()) ||
DSAStack->hasDirective(
[](OpenMPDirectiveKind K, const DeclarationNameInfo &,
SourceLocation) -> bool {
return isOpenMPTargetExecutionDirective(K);
},
false);
}
VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo,
unsigned StopAt) {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
D = getCanonicalDecl(D);
auto *VD = dyn_cast<VarDecl>(D);
// Do not capture constexpr variables.
if (VD && VD->isConstexpr())
return nullptr;
// If we want to determine whether the variable should be captured from the
// perspective of the current capturing scope, and we've already left all the
// capturing scopes of the top directive on the stack, check from the
// perspective of its parent directive (if any) instead.
DSAStackTy::ParentDirectiveScope InParentDirectiveRAII(
*DSAStack, CheckScopeInfo && DSAStack->isBodyComplete());
// If we are attempting to capture a global variable in a directive with
// 'target' we return true so that this global is also mapped to the device.
//
if (VD && !VD->hasLocalStorage() &&
(getCurCapturedRegion() || getCurBlock() || getCurLambda())) {
if (isInOpenMPDeclareTargetContext()) {
// Try to mark variable as declare target if it is used in capturing
// regions.
if (LangOpts.OpenMP <= 45 &&
!OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
checkDeclIsAllowedInOpenMPTarget(nullptr, VD);
return nullptr;
} else if (isInOpenMPTargetExecutionDirective()) {
// If the declaration is enclosed in a 'declare target' directive,
// then it should not be captured.
//
if (OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
return nullptr;
CapturedRegionScopeInfo *CSI = nullptr;
for (FunctionScopeInfo *FSI : llvm::drop_begin(
llvm::reverse(FunctionScopes),
CheckScopeInfo ? (FunctionScopes.size() - (StopAt + 1)) : 0)) {
if (!isa<CapturingScopeInfo>(FSI))
return nullptr;
if (auto *RSI = dyn_cast<CapturedRegionScopeInfo>(FSI))
if (RSI->CapRegionKind == CR_OpenMP) {
CSI = RSI;
break;
}
}
SmallVector<OpenMPDirectiveKind, 4> Regions;
getOpenMPCaptureRegions(Regions,
DSAStack->getDirective(CSI->OpenMPLevel));
if (Regions[CSI->OpenMPCaptureLevel] != OMPD_task)
return VD;
}
}
if (CheckScopeInfo) {
bool OpenMPFound = false;
for (unsigned I = StopAt + 1; I > 0; --I) {
FunctionScopeInfo *FSI = FunctionScopes[I - 1];
if (!isa<CapturingScopeInfo>(FSI))
return nullptr;
if (auto *RSI = dyn_cast<CapturedRegionScopeInfo>(FSI))
if (RSI->CapRegionKind == CR_OpenMP) {
OpenMPFound = true;
break;
}
}
if (!OpenMPFound)
return nullptr;
}
if (DSAStack->getCurrentDirective() != OMPD_unknown &&
(!DSAStack->isClauseParsingMode() ||
DSAStack->getParentDirective() != OMPD_unknown)) {
auto &&Info = DSAStack->isLoopControlVariable(D);
if (Info.first ||
(VD && VD->hasLocalStorage() &&
isImplicitOrExplicitTaskingRegion(DSAStack->getCurrentDirective())) ||
(VD && DSAStack->isForceVarCapturing()))
return VD ? VD : Info.second;
DSAStackTy::DSAVarData DVarTop =
DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode());
if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind))
return VD ? VD : cast<VarDecl>(DVarTop.PrivateCopy->getDecl());
// Threadprivate variables must not be captured.
if (isOpenMPThreadPrivate(DVarTop.CKind))
return nullptr;
// The variable is not private, or it is the variable in a directive with a
// default(none) clause and is not used in any clause.
DSAStackTy::DSAVarData DVarPrivate = DSAStack->hasDSA(
D, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; },
DSAStack->isClauseParsingMode());
// Global shared variables must not be captured.
if (VD && !VD->hasLocalStorage() && DVarPrivate.CKind == OMPC_unknown &&
((DSAStack->getDefaultDSA() != DSA_none &&
DSAStack->getDefaultDSA() != DSA_firstprivate) ||
DVarTop.CKind == OMPC_shared))
return nullptr;
if (DVarPrivate.CKind != OMPC_unknown ||
(VD && (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate)))
return VD ? VD : cast<VarDecl>(DVarPrivate.PrivateCopy->getDecl());
}
return nullptr;
}
void Sema::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex,
unsigned Level) const {
FunctionScopesIndex -= getOpenMPCaptureLevels(DSAStack->getDirective(Level));
}
void Sema::startOpenMPLoop() {
assert(LangOpts.OpenMP && "OpenMP must be enabled.");
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective()))
DSAStack->loopInit();
}
void Sema::startOpenMPCXXRangeFor() {
assert(LangOpts.OpenMP && "OpenMP must be enabled.");
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
DSAStack->resetPossibleLoopCounter();
DSAStack->loopStart();
}
}
OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level,
unsigned CapLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
if (DSAStack->hasExplicitDirective(
[](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); },
Level)) {
bool IsTriviallyCopyable =
- D->getType().getNonReferenceType().isTriviallyCopyableType(Context);
+ D->getType().getNonReferenceType().isTriviallyCopyableType(Context) &&
+ !D->getType()
+ .getNonReferenceType()
+ .getCanonicalType()
+ ->getAsCXXRecordDecl();
OpenMPDirectiveKind DKind = DSAStack->getDirective(Level);
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DKind);
if (isOpenMPTaskingDirective(CaptureRegions[CapLevel]) &&
(IsTriviallyCopyable ||
!isOpenMPTaskLoopDirective(CaptureRegions[CapLevel]))) {
if (DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_firstprivate; },
Level, /*NotLastprivate=*/true))
return OMPC_firstprivate;
DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level);
if (DVar.CKind != OMPC_shared &&
!DSAStack->isLoopControlVariable(D, Level).first && !DVar.RefExpr) {
DSAStack->addImplicitTaskFirstprivate(Level, D);
return OMPC_firstprivate;
}
}
}
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
if (DSAStack->getAssociatedLoops() > 0 &&
!DSAStack->isLoopStarted()) {
DSAStack->resetPossibleLoopCounter(D);
DSAStack->loopStart();
return OMPC_private;
}
if ((DSAStack->getPossiblyLoopCunter() == D->getCanonicalDecl() ||
DSAStack->isLoopControlVariable(D).first) &&
!DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K != OMPC_private; }, Level) &&
!isOpenMPSimdDirective(DSAStack->getCurrentDirective()))
return OMPC_private;
}
if (const auto *VD = dyn_cast<VarDecl>(D)) {
if (DSAStack->isThreadPrivate(const_cast<VarDecl *>(VD)) &&
DSAStack->isForceVarCapturing() &&
!DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_copyin; }, Level))
return OMPC_private;
}
// User-defined allocators are private since they must be defined in the
// context of the target region.
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, Level) &&
DSAStack->isUsesAllocatorsDecl(Level, D).getValueOr(
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait) ==
DSAStackTy::UsesAllocatorsDeclKind::UserDefinedAllocator)
return OMPC_private;
return (DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_private; }, Level) ||
(DSAStack->isClauseParsingMode() &&
DSAStack->getClauseParsingMode() == OMPC_private) ||
// Consider the taskgroup reduction descriptor variable private
// to avoid possible capture in the region.
(DSAStack->hasExplicitDirective(
[](OpenMPDirectiveKind K) {
return K == OMPD_taskgroup ||
((isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K)) &&
!isOpenMPSimdDirective(K));
},
Level) &&
DSAStack->isTaskgroupReductionRef(D, Level)))
? OMPC_private
: OMPC_unknown;
}
void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D,
unsigned Level) {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
D = getCanonicalDecl(D);
OpenMPClauseKind OMPC = OMPC_unknown;
for (unsigned I = DSAStack->getNestingLevel() + 1; I > Level; --I) {
const unsigned NewLevel = I - 1;
if (DSAStack->hasExplicitDSA(D,
[&OMPC](const OpenMPClauseKind K) {
if (isOpenMPPrivate(K)) {
OMPC = K;
return true;
}
return false;
},
NewLevel))
break;
if (DSAStack->checkMappableExprComponentListsForDeclAtLevel(
D, NewLevel,
[](OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind) { return true; })) {
OMPC = OMPC_map;
break;
}
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective,
NewLevel)) {
OMPC = OMPC_map;
if (DSAStack->mustBeFirstprivateAtLevel(
NewLevel, getVariableCategoryFromDecl(LangOpts, D)))
OMPC = OMPC_firstprivate;
break;
}
}
if (OMPC != OMPC_unknown)
FD->addAttr(OMPCaptureKindAttr::CreateImplicit(Context, unsigned(OMPC)));
}
bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level,
unsigned CaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
// Return true if the current level is no longer enclosed in a target region.
SmallVector<OpenMPDirectiveKind, 4> Regions;
getOpenMPCaptureRegions(Regions, DSAStack->getDirective(Level));
const auto *VD = dyn_cast<VarDecl>(D);
return VD && !VD->hasLocalStorage() &&
DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective,
Level) &&
Regions[CaptureLevel] != OMPD_task;
}
bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level,
unsigned CaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
// Return true if the current level is no longer enclosed in a target region.
if (const auto *VD = dyn_cast<VarDecl>(D)) {
if (!VD->hasLocalStorage()) {
DSAStackTy::DSAVarData TopDVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
unsigned NumLevels =
getOpenMPCaptureLevels(DSAStack->getDirective(Level));
if (Level == 0)
return (NumLevels == CaptureLevel + 1) && TopDVar.CKind != OMPC_shared;
DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level - 1);
return DVar.CKind != OMPC_shared ||
isOpenMPGlobalCapturedDecl(
D, Level - 1,
getOpenMPCaptureLevels(DSAStack->getDirective(Level - 1)) - 1);
}
}
return true;
}
void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
void Sema::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc,
OMPTraitInfo &TI) {
if (!OMPDeclareVariantScopes.empty()) {
Diag(Loc, diag::warn_nested_declare_variant);
return;
}
OMPDeclareVariantScopes.push_back(OMPDeclareVariantScope(TI));
}
void Sema::ActOnOpenMPEndDeclareVariant() {
assert(isInOpenMPDeclareVariantScope() &&
"Not in OpenMP declare variant scope!");
OMPDeclareVariantScopes.pop_back();
}
void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
const FunctionDecl *Callee,
SourceLocation Loc) {
assert(LangOpts.OpenMP && "Expected OpenMP compilation mode.");
Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl());
// Ignore host functions during device analysis.
if (LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_Host)
return;
// Ignore nohost functions during host analysis.
if (!LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
return;
const FunctionDecl *FD = Callee->getMostRecentDecl();
DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD);
if (LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_Host) {
// Diagnose host function called during device codegen.
StringRef HostDevTy =
getOpenMPSimpleClauseTypeName(OMPC_device_type, OMPC_DEVICE_TYPE_host);
Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0;
Diag(FD->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
diag::note_omp_marked_device_type_here)
<< HostDevTy;
return;
}
if (!LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
// Diagnose nohost function called during host codegen.
StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
OMPC_device_type, OMPC_DEVICE_TYPE_nohost);
Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1;
Diag(FD->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
diag::note_omp_marked_device_type_here)
<< NoHostDevTy;
}
}
void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind,
const DeclarationNameInfo &DirName,
Scope *CurScope, SourceLocation Loc) {
DSAStack->push(DKind, DirName, CurScope, Loc);
PushExpressionEvaluationContext(
ExpressionEvaluationContext::PotentiallyEvaluated);
}
void Sema::StartOpenMPClause(OpenMPClauseKind K) {
DSAStack->setClauseParsingMode(K);
}
void Sema::EndOpenMPClause() {
DSAStack->setClauseParsingMode(/*K=*/OMPC_unknown);
}
static std::pair<ValueDecl *, bool>
getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
SourceRange &ERange, bool AllowArraySection = false);
/// Check consistency of the reduction clauses.
static void checkReductionClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses) {
bool InscanFound = false;
SourceLocation InscanLoc;
// OpenMP 5.0, 2.19.5.4 reduction Clause, Restrictions.
// A reduction clause without the inscan reduction-modifier may not appear on
// a construct on which a reduction clause with the inscan reduction-modifier
// appears.
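// Hedged illustration (not from the original source; names are arbitrary):
//   #pragma omp for reduction(inscan, +: a) reduction(+: b)
//   // diagnosed: the reduction over 'b' lacks the inscan modifier on a
//   // construct where an inscan reduction already appears.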
for (OMPClause *C : Clauses) {
if (C->getClauseKind() != OMPC_reduction)
continue;
auto *RC = cast<OMPReductionClause>(C);
if (RC->getModifier() == OMPC_REDUCTION_inscan) {
InscanFound = true;
InscanLoc = RC->getModifierLoc();
continue;
}
if (RC->getModifier() == OMPC_REDUCTION_task) {
// OpenMP 5.0, 2.19.5.4 reduction Clause.
// A reduction clause with the task reduction-modifier may only appear on
// a parallel construct, a worksharing construct or a combined or
// composite construct for which any of the aforementioned constructs is a
// constituent construct and simd or loop are not constituent constructs.
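// Hedged illustration (not from the original source; loop bodies omitted):
//   #pragma omp parallel for reduction(task, +: x)      // OK
//   #pragma omp parallel for simd reduction(task, +: x) // diagnosed: simd is
//                                                       // a constituent construct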
OpenMPDirectiveKind CurDir = Stack->getCurrentDirective();
if (!(isOpenMPParallelDirective(CurDir) ||
isOpenMPWorksharingDirective(CurDir)) ||
isOpenMPSimdDirective(CurDir))
S.Diag(RC->getModifierLoc(),
diag::err_omp_reduction_task_not_parallel_or_worksharing);
continue;
}
}
if (InscanFound) {
for (OMPClause *C : Clauses) {
if (C->getClauseKind() != OMPC_reduction)
continue;
auto *RC = cast<OMPReductionClause>(C);
if (RC->getModifier() != OMPC_REDUCTION_inscan) {
S.Diag(RC->getModifier() == OMPC_REDUCTION_unknown
? RC->getBeginLoc()
: RC->getModifierLoc(),
diag::err_omp_inscan_reduction_expected);
S.Diag(InscanLoc, diag::note_omp_previous_inscan_reduction);
continue;
}
for (Expr *Ref : RC->varlists()) {
assert(Ref && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = Ref;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
ValueDecl *D = Res.first;
if (!D)
continue;
if (!Stack->isUsedInScanDirective(getCanonicalDecl(D))) {
S.Diag(Ref->getExprLoc(),
diag::err_omp_reduction_not_inclusive_exclusive)
<< Ref->getSourceRange();
}
}
}
}
}
static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses);
static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
bool WithInit);
static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack,
const ValueDecl *D,
const DSAStackTy::DSAVarData &DVar,
bool IsLoopIterVar = false);
void Sema::EndOpenMPDSABlock(Stmt *CurDirective) {
// OpenMP [2.14.3.5, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a lastprivate
// clause requires an accessible, unambiguous default constructor for the
// class type, unless the list item is also specified in a firstprivate
// clause.
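// Hedged illustration of the rule above (not from the original source;
// loop bodies omitted):
//   struct S { S(int); };  // no accessible default constructor
//   S s(0);
//   #pragma omp parallel for lastprivate(s)                 // diagnosed
//   #pragma omp parallel for firstprivate(s) lastprivate(s) // OK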
if (const auto *D = dyn_cast_or_null<OMPExecutableDirective>(CurDirective)) {
for (OMPClause *C : D->clauses()) {
if (auto *Clause = dyn_cast<OMPLastprivateClause>(C)) {
SmallVector<Expr *, 8> PrivateCopies;
for (Expr *DE : Clause->varlists()) {
if (DE->isValueDependent() || DE->isTypeDependent()) {
PrivateCopies.push_back(nullptr);
continue;
}
auto *DRE = cast<DeclRefExpr>(DE->IgnoreParens());
auto *VD = cast<VarDecl>(DRE->getDecl());
QualType Type = VD->getType().getNonReferenceType();
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(VD, /*FromParent=*/false);
if (DVar.CKind == OMPC_lastprivate) {
// Generate a helper private variable and initialize it with the
// default value. The address of the original variable is replaced
// by the address of the new private variable in CodeGen. This new
// variable is not added to IdResolver, so the code in the OpenMP
// region uses the original variable for proper diagnostics.
VarDecl *VDPrivate = buildVarDecl(
*this, DE->getExprLoc(), Type.getUnqualifiedType(),
VD->getName(), VD->hasAttrs() ? &VD->getAttrs() : nullptr, DRE);
ActOnUninitializedDecl(VDPrivate);
if (VDPrivate->isInvalidDecl()) {
PrivateCopies.push_back(nullptr);
continue;
}
PrivateCopies.push_back(buildDeclRefExpr(
*this, VDPrivate, DE->getType(), DE->getExprLoc()));
} else {
// The variable is also a firstprivate, so the initialization sequence
// for the private copy has already been generated.
PrivateCopies.push_back(nullptr);
}
}
Clause->setPrivateCopies(PrivateCopies);
continue;
}
// Finalize nontemporal clause by handling private copies, if any.
if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
SmallVector<Expr *, 8> PrivateRefs;
for (Expr *RefExpr : Clause->varlists()) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second)
// It will be analyzed later.
PrivateRefs.push_back(RefExpr);
ValueDecl *D = Res.first;
if (!D)
continue;
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy
: SimpleRefExpr);
}
Clause->setPrivateRefs(PrivateRefs);
continue;
}
if (auto *Clause = dyn_cast<OMPUsesAllocatorsClause>(C)) {
for (unsigned I = 0, E = Clause->getNumberOfAllocators(); I < E; ++I) {
OMPUsesAllocatorsClause::Data D = Clause->getAllocatorData(I);
auto *DRE = dyn_cast<DeclRefExpr>(D.Allocator->IgnoreParenImpCasts());
if (!DRE)
continue;
ValueDecl *VD = DRE->getDecl();
if (!VD || !isa<VarDecl>(VD))
continue;
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(VD, /*FromParent=*/false);
// OpenMP [2.12.5, target Construct]
// Memory allocators that appear in a uses_allocators clause cannot
// appear in other data-sharing attribute clauses or data-mapping
// attribute clauses in the same construct.
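// Hedged illustration (not from the original source; 'my_alloc' and
// 'my_traits' are hypothetical names):
//   omp_allocator_handle_t my_alloc;
//   #pragma omp target uses_allocators(my_alloc(my_traits)) firstprivate(my_alloc)
//   // diagnosed: 'my_alloc' also appears in a data-sharing attribute clause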
Expr *MapExpr = nullptr;
if (DVar.RefExpr ||
DSAStack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[VD, &MapExpr](
OMPClauseMappableExprCommon::MappableExprComponentListRef
MapExprComponents,
OpenMPClauseKind C) {
auto MI = MapExprComponents.rbegin();
auto ME = MapExprComponents.rend();
if (MI != ME &&
MI->getAssociatedDeclaration()->getCanonicalDecl() ==
VD->getCanonicalDecl()) {
MapExpr = MI->getAssociatedExpression();
return true;
}
return false;
})) {
Diag(D.Allocator->getExprLoc(),
diag::err_omp_allocator_used_in_clauses)
<< D.Allocator->getSourceRange();
if (DVar.RefExpr)
reportOriginalDsa(*this, DSAStack, VD, DVar);
else
Diag(MapExpr->getExprLoc(), diag::note_used_here)
<< MapExpr->getSourceRange();
}
}
continue;
}
}
// Check allocate clauses.
if (!CurContext->isDependentContext())
checkAllocateClauses(*this, DSAStack, D->clauses());
checkReductionClauses(*this, DSAStack, D->clauses());
}
DSAStack->pop();
DiscardCleanupsInEvaluationContext();
PopExpressionEvaluationContext();
}
static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
Expr *NumIterations, Sema &SemaRef,
Scope *S, DSAStackTy *Stack);
namespace {
class VarDeclFilterCCC final : public CorrectionCandidateCallback {
private:
Sema &SemaRef;
public:
explicit VarDeclFilterCCC(Sema &S) : SemaRef(S) {}
bool ValidateCandidate(const TypoCorrection &Candidate) override {
NamedDecl *ND = Candidate.getCorrectionDecl();
if (const auto *VD = dyn_cast_or_null<VarDecl>(ND)) {
return VD->hasGlobalStorage() &&
SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
SemaRef.getCurScope());
}
return false;
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return std::make_unique<VarDeclFilterCCC>(*this);
}
};
class VarOrFuncDeclFilterCCC final : public CorrectionCandidateCallback {
private:
Sema &SemaRef;
public:
explicit VarOrFuncDeclFilterCCC(Sema &S) : SemaRef(S) {}
bool ValidateCandidate(const TypoCorrection &Candidate) override {
NamedDecl *ND = Candidate.getCorrectionDecl();
if (ND && ((isa<VarDecl>(ND) && ND->getKind() == Decl::Var) ||
isa<FunctionDecl>(ND))) {
return SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
SemaRef.getCurScope());
}
return false;
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return std::make_unique<VarOrFuncDeclFilterCCC>(*this);
}
};
} // namespace
ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope,
CXXScopeSpec &ScopeSpec,
const DeclarationNameInfo &Id,
OpenMPDirectiveKind Kind) {
LookupResult Lookup(*this, Id, LookupOrdinaryName);
LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return ExprError();
VarDecl *VD;
if (!Lookup.isSingleResult()) {
VarDeclFilterCCC CCC(*this);
if (TypoCorrection Corrected =
CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC,
CTK_ErrorRecovery)) {
diagnoseTypo(Corrected,
PDiag(Lookup.empty()
? diag::err_undeclared_var_use_suggest
: diag::err_omp_expected_var_arg_suggest)
<< Id.getName());
VD = Corrected.getCorrectionDeclAs<VarDecl>();
} else {
Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use
: diag::err_omp_expected_var_arg)
<< Id.getName();
return ExprError();
}
} else if (!(VD = Lookup.getAsSingle<VarDecl>())) {
Diag(Id.getLoc(), diag::err_omp_expected_var_arg) << Id.getName();
Diag(Lookup.getFoundDecl()->getLocation(), diag::note_declared_at);
return ExprError();
}
Lookup.suppressDiagnostics();
// OpenMP [2.9.2, Syntax, C/C++]
// Variables must be file-scope, namespace-scope, or static block-scope.
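// Hedged illustration (not from the original source):
//   void foo() {
//     int a;
//     #pragma omp threadprivate(a) // diagnosed: 'a' has automatic storage
//     static int b;
//     #pragma omp threadprivate(b) // OK: static block-scope variable
//   }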
if (Kind == OMPD_threadprivate && !VD->hasGlobalStorage()) {
Diag(Id.getLoc(), diag::err_omp_global_var_arg)
<< getOpenMPDirectiveName(Kind) << !VD->isStaticLocal();
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
}
VarDecl *CanonicalVD = VD->getCanonicalDecl();
NamedDecl *ND = CanonicalVD;
// OpenMP [2.9.2, Restrictions, C/C++, p.2]
// A threadprivate directive for file-scope variables must appear outside
// any definition or declaration.
if (CanonicalVD->getDeclContext()->isTranslationUnit() &&
!getCurLexicalContext()->isTranslationUnit()) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
}
// OpenMP [2.9.2, Restrictions, C/C++, p.3]
// A threadprivate directive for static class member variables must appear
// in the class definition, in the same scope in which the member
// variables are declared.
if (CanonicalVD->isStaticDataMember() &&
!CanonicalVD->getDeclContext()->Equals(getCurLexicalContext())) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
}
// OpenMP [2.9.2, Restrictions, C/C++, p.4]
// A threadprivate directive for namespace-scope variables must appear
// outside any definition or declaration other than the namespace
// definition itself.
if (CanonicalVD->getDeclContext()->isNamespace() &&
(!getCurLexicalContext()->isFileContext() ||
!getCurLexicalContext()->Encloses(CanonicalVD->getDeclContext()))) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
}
// OpenMP [2.9.2, Restrictions, C/C++, p.6]
// A threadprivate directive for static block-scope variables must appear
// in the scope of the variable and not in a nested scope.
if (CanonicalVD->isLocalVarDecl() && CurScope &&
!isDeclInScope(ND, getCurLexicalContext(), CurScope)) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
}
// OpenMP [2.9.2, Restrictions, C/C++, p.2-6]
// A threadprivate directive must lexically precede all references to any
// of the variables in its list.
if (Kind == OMPD_threadprivate && VD->isUsed() &&
!DSAStack->isThreadPrivate(VD)) {
Diag(Id.getLoc(), diag::err_omp_var_used)
<< getOpenMPDirectiveName(Kind) << VD;
return ExprError();
}
QualType ExprType = VD->getType().getNonReferenceType();
return DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
SourceLocation(), VD,
/*RefersToEnclosingVariableOrCapture=*/false,
Id.getLoc(), ExprType, VK_LValue);
}
Sema::DeclGroupPtrTy
Sema::ActOnOpenMPThreadprivateDirective(SourceLocation Loc,
ArrayRef<Expr *> VarList) {
if (OMPThreadPrivateDecl *D = CheckOMPThreadPrivateDecl(Loc, VarList)) {
CurContext->addDecl(D);
return DeclGroupPtrTy::make(DeclGroupRef(D));
}
return nullptr;
}
namespace {
class LocalVarRefChecker final
: public ConstStmtVisitor<LocalVarRefChecker, bool> {
Sema &SemaRef;
public:
bool VisitDeclRefExpr(const DeclRefExpr *E) {
if (const auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
if (VD->hasLocalStorage()) {
SemaRef.Diag(E->getBeginLoc(),
diag::err_omp_local_var_in_threadprivate_init)
<< E->getSourceRange();
SemaRef.Diag(VD->getLocation(), diag::note_defined_here)
<< VD << VD->getSourceRange();
return true;
}
}
return false;
}
bool VisitStmt(const Stmt *S) {
for (const Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
}
return false;
}
explicit LocalVarRefChecker(Sema &SemaRef) : SemaRef(SemaRef) {}
};
} // namespace
OMPThreadPrivateDecl *
Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef<Expr *> VarList) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
auto *DE = cast<DeclRefExpr>(RefExpr);
auto *VD = cast<VarDecl>(DE->getDecl());
SourceLocation ILoc = DE->getExprLoc();
// Mark variable as used.
VD->setReferenced();
VD->markUsed(Context);
QualType QType = VD->getType();
if (QType->isDependentType() || QType->isInstantiationDependentType()) {
// It will be analyzed later.
Vars.push_back(DE);
continue;
}
// OpenMP [2.9.2, Restrictions, C/C++, p.10]
// A threadprivate variable must not have an incomplete type.
if (RequireCompleteType(ILoc, VD->getType(),
diag::err_omp_threadprivate_incomplete_type)) {
continue;
}
// OpenMP [2.9.2, Restrictions, C/C++, p.10]
// A threadprivate variable must not have a reference type.
if (VD->getType()->isReferenceType()) {
Diag(ILoc, diag::err_omp_ref_type_arg)
<< getOpenMPDirectiveName(OMPD_threadprivate) << VD->getType();
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
continue;
}
// Check if this is a TLS variable. If TLS is not supported, produce
// the corresponding diagnostic.
if ((VD->getTLSKind() != VarDecl::TLS_None &&
!(VD->hasAttr<OMPThreadPrivateDeclAttr>() &&
getLangOpts().OpenMPUseTLS &&
getASTContext().getTargetInfo().isTLSSupported())) ||
(VD->getStorageClass() == SC_Register && VD->hasAttr<AsmLabelAttr>() &&
!VD->isLocalVarDecl())) {
Diag(ILoc, diag::err_omp_var_thread_local)
<< VD << ((VD->getTLSKind() != VarDecl::TLS_None) ? 0 : 1);
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
continue;
}
// Check if the initial value of the threadprivate variable references a
// variable with local storage (this is not supported by the runtime).
if (const Expr *Init = VD->getAnyInitializer()) {
LocalVarRefChecker Checker(*this);
if (Checker.Visit(Init))
continue;
}
Vars.push_back(RefExpr);
DSAStack->addDSA(VD, DE, OMPC_threadprivate);
VD->addAttr(OMPThreadPrivateDeclAttr::CreateImplicit(
Context, SourceRange(Loc, Loc)));
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPThreadPrivate(VD);
}
OMPThreadPrivateDecl *D = nullptr;
if (!Vars.empty()) {
D = OMPThreadPrivateDecl::Create(Context, getCurLexicalContext(), Loc,
Vars);
D->setAccess(AS_public);
}
return D;
}
static OMPAllocateDeclAttr::AllocatorTypeTy
getAllocatorKind(Sema &S, DSAStackTy *Stack, Expr *Allocator) {
if (!Allocator)
return OMPAllocateDeclAttr::OMPNullMemAlloc;
if (Allocator->isTypeDependent() || Allocator->isValueDependent() ||
Allocator->isInstantiationDependent() ||
Allocator->containsUnexpandedParameterPack())
return OMPAllocateDeclAttr::OMPUserDefinedMemAlloc;
auto AllocatorKindRes = OMPAllocateDeclAttr::OMPUserDefinedMemAlloc;
const Expr *AE = Allocator->IgnoreParenImpCasts();
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
const Expr *DefAllocator = Stack->getAllocator(AllocatorKind);
llvm::FoldingSetNodeID AEId, DAEId;
AE->Profile(AEId, S.getASTContext(), /*Canonical=*/true);
DefAllocator->Profile(DAEId, S.getASTContext(), /*Canonical=*/true);
if (AEId == DAEId) {
AllocatorKindRes = AllocatorKind;
break;
}
}
return AllocatorKindRes;
}
static bool checkPreviousOMPAllocateAttribute(
Sema &S, DSAStackTy *Stack, Expr *RefExpr, VarDecl *VD,
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind, Expr *Allocator) {
if (!VD->hasAttr<OMPAllocateDeclAttr>())
return false;
const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
Expr *PrevAllocator = A->getAllocator();
OMPAllocateDeclAttr::AllocatorTypeTy PrevAllocatorKind =
getAllocatorKind(S, Stack, PrevAllocator);
bool AllocatorsMatch = AllocatorKind == PrevAllocatorKind;
if (AllocatorsMatch &&
AllocatorKind == OMPAllocateDeclAttr::OMPUserDefinedMemAlloc &&
Allocator && PrevAllocator) {
const Expr *AE = Allocator->IgnoreParenImpCasts();
const Expr *PAE = PrevAllocator->IgnoreParenImpCasts();
llvm::FoldingSetNodeID AEId, PAEId;
AE->Profile(AEId, S.Context, /*Canonical=*/true);
PAE->Profile(PAEId, S.Context, /*Canonical=*/true);
AllocatorsMatch = AEId == PAEId;
}
if (!AllocatorsMatch) {
SmallString<256> AllocatorBuffer;
llvm::raw_svector_ostream AllocatorStream(AllocatorBuffer);
if (Allocator)
Allocator->printPretty(AllocatorStream, nullptr, S.getPrintingPolicy());
SmallString<256> PrevAllocatorBuffer;
llvm::raw_svector_ostream PrevAllocatorStream(PrevAllocatorBuffer);
if (PrevAllocator)
PrevAllocator->printPretty(PrevAllocatorStream, nullptr,
S.getPrintingPolicy());
SourceLocation AllocatorLoc =
Allocator ? Allocator->getExprLoc() : RefExpr->getExprLoc();
SourceRange AllocatorRange =
Allocator ? Allocator->getSourceRange() : RefExpr->getSourceRange();
SourceLocation PrevAllocatorLoc =
PrevAllocator ? PrevAllocator->getExprLoc() : A->getLocation();
SourceRange PrevAllocatorRange =
PrevAllocator ? PrevAllocator->getSourceRange() : A->getRange();
S.Diag(AllocatorLoc, diag::warn_omp_used_different_allocator)
<< (Allocator ? 1 : 0) << AllocatorStream.str()
<< (PrevAllocator ? 1 : 0) << PrevAllocatorStream.str()
<< AllocatorRange;
S.Diag(PrevAllocatorLoc, diag::note_omp_previous_allocator)
<< PrevAllocatorRange;
return true;
}
return false;
}
static void
applyOMPAllocateAttribute(Sema &S, VarDecl *VD,
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind,
Expr *Allocator, SourceRange SR) {
if (VD->hasAttr<OMPAllocateDeclAttr>())
return;
if (Allocator &&
(Allocator->isTypeDependent() || Allocator->isValueDependent() ||
Allocator->isInstantiationDependent() ||
Allocator->containsUnexpandedParameterPack()))
return;
auto *A = OMPAllocateDeclAttr::CreateImplicit(S.Context, AllocatorKind,
Allocator, SR);
VD->addAttr(A);
if (ASTMutationListener *ML = S.Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPAllocate(VD, A);
}
Sema::DeclGroupPtrTy Sema::ActOnOpenMPAllocateDirective(
SourceLocation Loc, ArrayRef<Expr *> VarList,
ArrayRef<OMPClause *> Clauses, DeclContext *Owner) {
assert(Clauses.size() <= 1 && "Expected at most one clause.");
Expr *Allocator = nullptr;
if (Clauses.empty()) {
// OpenMP 5.0, 2.11.3 allocate Directive, Restrictions.
// allocate directives that appear in a target region must specify an
// allocator clause unless a requires directive with the dynamic_allocators
// clause is present in the same compilation unit.
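// Hedged illustration (not from the original source):
//   #pragma omp target
//   {
//     int buf[8];
//     #pragma omp allocate(buf) // diagnosed on the device unless a 'requires
//                               // dynamic_allocators' directive is present or
//                               // an allocator clause is added
//   }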
if (LangOpts.OpenMPIsDevice &&
!DSAStack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>())
targetDiag(Loc, diag::err_expected_allocator_clause);
} else {
Allocator = cast<OMPAllocatorClause>(Clauses.back())->getAllocator();
}
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind =
getAllocatorKind(*this, DSAStack, Allocator);
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
auto *DE = cast<DeclRefExpr>(RefExpr);
auto *VD = cast<VarDecl>(DE->getDecl());
// Check if this is a TLS variable or global register.
if (VD->getTLSKind() != VarDecl::TLS_None ||
VD->hasAttr<OMPThreadPrivateDeclAttr>() ||
(VD->getStorageClass() == SC_Register && VD->hasAttr<AsmLabelAttr>() &&
!VD->isLocalVarDecl()))
continue;
// If the variable is used several times in allocate directives, the same
// allocator must be used.
if (checkPreviousOMPAllocateAttribute(*this, DSAStack, RefExpr, VD,
AllocatorKind, Allocator))
continue;
// OpenMP, 2.11.3 allocate Directive, Restrictions, C / C++
// If a list item has static storage duration, the allocator expression in the
// allocator clause must be a constant expression that evaluates to one of
// the predefined memory allocator values.
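// Hedged illustration (not from the original source; 'my_alloc' is a
// hypothetical user-defined allocator):
//   static int a, b;
//   #pragma omp allocate(a) allocator(omp_const_mem_alloc) // OK: predefined
//   #pragma omp allocate(b) allocator(my_alloc)            // diagnosed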
if (Allocator && VD->hasGlobalStorage()) {
if (AllocatorKind == OMPAllocateDeclAttr::OMPUserDefinedMemAlloc) {
Diag(Allocator->getExprLoc(),
diag::err_omp_expected_predefined_allocator)
<< Allocator->getSourceRange();
bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
Diag(VD->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
continue;
}
}
Vars.push_back(RefExpr);
applyOMPAllocateAttribute(*this, VD, AllocatorKind, Allocator,
DE->getSourceRange());
}
if (Vars.empty())
return nullptr;
if (!Owner)
Owner = getCurLexicalContext();
auto *D = OMPAllocateDecl::Create(Context, Owner, Loc, Vars, Clauses);
D->setAccess(AS_public);
Owner->addDecl(D);
return DeclGroupPtrTy::make(DeclGroupRef(D));
}
Sema::DeclGroupPtrTy
Sema::ActOnOpenMPRequiresDirective(SourceLocation Loc,
ArrayRef<OMPClause *> ClauseList) {
OMPRequiresDecl *D = nullptr;
if (!CurContext->isFileContext()) {
Diag(Loc, diag::err_omp_invalid_scope) << "requires";
} else {
D = CheckOMPRequiresDecl(Loc, ClauseList);
if (D) {
CurContext->addDecl(D);
DSAStack->addRequiresDecl(D);
}
}
return DeclGroupPtrTy::make(DeclGroupRef(D));
}
OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc,
ArrayRef<OMPClause *> ClauseList) {
/// For target-specific clauses, the requires directive cannot be
/// specified after the handling of any of the target regions in the
/// current compilation unit.
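/// Hedged illustration (not from the original source):
///   void f() {
///   #pragma omp target
///     ;
///   }
///   #pragma omp requires unified_shared_memory // diagnosed: a target region
///                                              // was already encountered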
ArrayRef<SourceLocation> TargetLocations =
DSAStack->getEncounteredTargetLocs();
SourceLocation AtomicLoc = DSAStack->getAtomicDirectiveLoc();
if (!TargetLocations.empty() || !AtomicLoc.isInvalid()) {
for (const OMPClause *CNew : ClauseList) {
// Check if any of the requires clauses affect target regions.
if (isa<OMPUnifiedSharedMemoryClause>(CNew) ||
isa<OMPUnifiedAddressClause>(CNew) ||
isa<OMPReverseOffloadClause>(CNew) ||
isa<OMPDynamicAllocatorsClause>(CNew)) {
Diag(Loc, diag::err_omp_directive_before_requires)
<< "target" << getOpenMPClauseName(CNew->getClauseKind());
for (SourceLocation TargetLoc : TargetLocations) {
Diag(TargetLoc, diag::note_omp_requires_encountered_directive)
<< "target";
}
} else if (!AtomicLoc.isInvalid() &&
isa<OMPAtomicDefaultMemOrderClause>(CNew)) {
Diag(Loc, diag::err_omp_directive_before_requires)
<< "atomic" << getOpenMPClauseName(CNew->getClauseKind());
Diag(AtomicLoc, diag::note_omp_requires_encountered_directive)
<< "atomic";
}
}
}
if (!DSAStack->hasDuplicateRequiresClause(ClauseList))
return OMPRequiresDecl::Create(Context, getCurLexicalContext(), Loc,
ClauseList);
return nullptr;
}
static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack,
const ValueDecl *D,
const DSAStackTy::DSAVarData &DVar,
bool IsLoopIterVar) {
if (DVar.RefExpr) {
SemaRef.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(DVar.CKind);
return;
}
enum {
PDSA_StaticMemberShared,
PDSA_StaticLocalVarShared,
PDSA_LoopIterVarPrivate,
PDSA_LoopIterVarLinear,
PDSA_LoopIterVarLastprivate,
PDSA_ConstVarShared,
PDSA_GlobalVarShared,
PDSA_TaskVarFirstprivate,
PDSA_LocalVarPrivate,
PDSA_Implicit
} Reason = PDSA_Implicit;
bool ReportHint = false;
auto ReportLoc = D->getLocation();
auto *VD = dyn_cast<VarDecl>(D);
if (IsLoopIterVar) {
if (DVar.CKind == OMPC_private)
Reason = PDSA_LoopIterVarPrivate;
else if (DVar.CKind == OMPC_lastprivate)
Reason = PDSA_LoopIterVarLastprivate;
else
Reason = PDSA_LoopIterVarLinear;
} else if (isOpenMPTaskingDirective(DVar.DKind) &&
DVar.CKind == OMPC_firstprivate) {
Reason = PDSA_TaskVarFirstprivate;
ReportLoc = DVar.ImplicitDSALoc;
} else if (VD && VD->isStaticLocal())
Reason = PDSA_StaticLocalVarShared;
else if (VD && VD->isStaticDataMember())
Reason = PDSA_StaticMemberShared;
else if (VD && VD->isFileVarDecl())
Reason = PDSA_GlobalVarShared;
else if (D->getType().isConstant(SemaRef.getASTContext()))
Reason = PDSA_ConstVarShared;
else if (VD && VD->isLocalVarDecl() && DVar.CKind == OMPC_private) {
ReportHint = true;
Reason = PDSA_LocalVarPrivate;
}
if (Reason != PDSA_Implicit) {
SemaRef.Diag(ReportLoc, diag::note_omp_predetermined_dsa)
<< Reason << ReportHint
<< getOpenMPDirectiveName(Stack->getCurrentDirective());
} else if (DVar.ImplicitDSALoc.isValid()) {
SemaRef.Diag(DVar.ImplicitDSALoc, diag::note_omp_implicit_dsa)
<< getOpenMPClauseName(DVar.CKind);
}
}
static OpenMPMapClauseKind
getMapClauseKindFromModifier(OpenMPDefaultmapClauseModifier M,
bool IsAggregateOrDeclareTarget) {
OpenMPMapClauseKind Kind = OMPC_MAP_unknown;
switch (M) {
case OMPC_DEFAULTMAP_MODIFIER_alloc:
Kind = OMPC_MAP_alloc;
break;
case OMPC_DEFAULTMAP_MODIFIER_to:
Kind = OMPC_MAP_to;
break;
case OMPC_DEFAULTMAP_MODIFIER_from:
Kind = OMPC_MAP_from;
break;
case OMPC_DEFAULTMAP_MODIFIER_tofrom:
Kind = OMPC_MAP_tofrom;
break;
case OMPC_DEFAULTMAP_MODIFIER_firstprivate:
case OMPC_DEFAULTMAP_MODIFIER_last:
llvm_unreachable("Unexpected defaultmap implicit behavior");
case OMPC_DEFAULTMAP_MODIFIER_none:
case OMPC_DEFAULTMAP_MODIFIER_default:
case OMPC_DEFAULTMAP_MODIFIER_unknown:
// IsAggregateOrDeclareTarget could be true if:
// 1. the implicit behavior for aggregate is tofrom
// 2. it's a declare target link
if (IsAggregateOrDeclareTarget) {
Kind = OMPC_MAP_tofrom;
break;
}
llvm_unreachable("Unexpected defaultmap implicit behavior");
}
assert(Kind != OMPC_MAP_unknown && "Expect map kind to be known");
return Kind;
}
namespace {
class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
DSAStackTy *Stack;
Sema &SemaRef;
bool ErrorFound = false;
bool TryCaptureCXXThisMembers = false;
CapturedStmt *CS = nullptr;
llvm::SmallVector<Expr *, 4> ImplicitFirstprivate;
llvm::SmallVector<Expr *, 4> ImplicitMap[OMPC_MAP_delete];
Sema::VarsWithInheritedDSAType VarsWithInheritedDSA;
llvm::SmallDenseSet<const ValueDecl *, 4> ImplicitDeclarations;
void VisitSubCaptures(OMPExecutableDirective *S) {
// Check implicitly captured variables.
if (!S->hasAssociatedStmt() || !S->getAssociatedStmt())
return;
visitSubCaptures(S->getInnermostCapturedStmt());
// Try to capture inner this->member references to generate correct mappings
// and diagnostics.
if (TryCaptureCXXThisMembers ||
(isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
llvm::any_of(S->getInnermostCapturedStmt()->captures(),
[](const CapturedStmt::Capture &C) {
return C.capturesThis();
}))) {
bool SavedTryCaptureCXXThisMembers = TryCaptureCXXThisMembers;
TryCaptureCXXThisMembers = true;
Visit(S->getInnermostCapturedStmt()->getCapturedStmt());
TryCaptureCXXThisMembers = SavedTryCaptureCXXThisMembers;
}
// In tasks, firstprivates are not captured anymore, so they need to be
// analyzed explicitly.
if (isOpenMPTaskingDirective(S->getDirectiveKind()) &&
!isOpenMPTaskLoopDirective(S->getDirectiveKind())) {
for (OMPClause *C : S->clauses())
if (auto *FC = dyn_cast<OMPFirstprivateClause>(C)) {
for (Expr *Ref : FC->varlists())
Visit(Ref);
}
}
}
public:
void VisitDeclRefExpr(DeclRefExpr *E) {
if (TryCaptureCXXThisMembers || E->isTypeDependent() ||
E->isValueDependent() || E->containsUnexpandedParameterPack() ||
E->isInstantiationDependent())
return;
if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
// Check the data-sharing rules for the expressions in the clauses.
if (!CS) {
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(VD))
if (!CED->hasAttr<OMPCaptureNoInitAttr>()) {
Visit(CED->getInit());
return;
}
} else if (VD->isImplicit() || isa<OMPCapturedExprDecl>(VD))
// Do not analyze internal variables and do not include them in
// implicit clauses.
return;
VD = VD->getCanonicalDecl();
// Skip internally declared variables.
if (VD->hasLocalStorage() && CS && !CS->capturesVariable(VD) &&
!Stack->isImplicitTaskFirstprivate(VD))
return;
// Skip allocators in uses_allocators clauses.
if (Stack->isUsesAllocatorsDecl(VD).hasValue())
return;
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(VD, /*FromParent=*/false);
// Check if the variable has an explicit DSA set and stop the analysis if so.
if (DVar.RefExpr || !ImplicitDeclarations.insert(VD).second)
return;
// Skip internally declared static variables.
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (VD->hasGlobalStorage() && CS && !CS->capturesVariable(VD) &&
(Stack->hasRequiresDeclWithClause<OMPUnifiedSharedMemoryClause>() ||
!Res || *Res != OMPDeclareTargetDeclAttr::MT_Link) &&
!Stack->isImplicitTaskFirstprivate(VD))
return;
SourceLocation ELoc = E->getExprLoc();
OpenMPDirectiveKind DKind = Stack->getCurrentDirective();
// The default(none) clause requires that each variable that is referenced
// in the construct, and does not have a predetermined data-sharing
// attribute, must have its data-sharing attribute explicitly determined
// by being listed in a data-sharing attribute clause.
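// For example:
//   int x = 0;
//   #pragma omp parallel default(none)
//   x = 1; // error: 'x' must be listed in a data-sharing attribute clause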
if (DVar.CKind == OMPC_unknown &&
(Stack->getDefaultDSA() == DSA_none ||
Stack->getDefaultDSA() == DSA_firstprivate) &&
isImplicitOrExplicitTaskingRegion(DKind) &&
VarsWithInheritedDSA.count(VD) == 0) {
bool InheritedDSA = Stack->getDefaultDSA() == DSA_none;
if (!InheritedDSA && Stack->getDefaultDSA() == DSA_firstprivate) {
DSAStackTy::DSAVarData DVar =
Stack->getImplicitDSA(VD, /*FromParent=*/false);
InheritedDSA = DVar.CKind == OMPC_unknown;
}
if (InheritedDSA)
VarsWithInheritedDSA[VD] = E;
return;
}
// OpenMP 5.0 [2.19.7.2, defaultmap clause, Description]
// If implicit-behavior is none, each variable referenced in the
// construct that does not have a predetermined data-sharing attribute
// and does not appear in a to or link clause on a declare target
// directive must be listed in a data-mapping attribute clause, a
// data-sharing attribute clause (including a data-sharing attribute
// clause on a combined construct where target is one of the
// constituent constructs), or an is_device_ptr clause.
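// For example:
//   int *p;
//   #pragma omp target defaultmap(none)
//   *p = 0; // error: 'p' must appear in a map, data-sharing, or
//           // is_device_ptr clause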
OpenMPDefaultmapClauseKind ClauseKind =
getVariableCategoryFromDecl(SemaRef.getLangOpts(), VD);
if (SemaRef.getLangOpts().OpenMP >= 50) {
bool IsModifierNone = Stack->getDefaultmapModifier(ClauseKind) ==
OMPC_DEFAULTMAP_MODIFIER_none;
if (DVar.CKind == OMPC_unknown && IsModifierNone &&
VarsWithInheritedDSA.count(VD) == 0 && !Res) {
// Only check for data-mapping attribute and is_device_ptr here
// since we have already made sure that the declaration does not
// have a data-sharing attribute above
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[VD](OMPClauseMappableExprCommon::MappableExprComponentListRef
MapExprComponents,
OpenMPClauseKind) {
auto MI = MapExprComponents.rbegin();
auto ME = MapExprComponents.rend();
return MI != ME && MI->getAssociatedDeclaration() == VD;
})) {
VarsWithInheritedDSA[VD] = E;
return;
}
}
}
if (isOpenMPTargetExecutionDirective(DKind) &&
!Stack->isLoopControlVariable(VD).first) {
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[](OMPClauseMappableExprCommon::MappableExprComponentListRef
StackComponents,
OpenMPClauseKind) {
// Variable is used if it has been marked as an array, array
// section, array shaping or the variable itself.
return StackComponents.size() == 1 ||
std::all_of(
std::next(StackComponents.rbegin()),
StackComponents.rend(),
[](const OMPClauseMappableExprCommon::
MappableComponent &MC) {
return MC.getAssociatedDeclaration() ==
nullptr &&
(isa<OMPArraySectionExpr>(
MC.getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(
MC.getAssociatedExpression()) ||
isa<ArraySubscriptExpr>(
MC.getAssociatedExpression()));
});
})) {
bool IsFirstprivate = false;
// By default lambdas are captured as firstprivates.
if (const auto *RD =
VD->getType().getNonReferenceType()->getAsCXXRecordDecl())
IsFirstprivate = RD->isLambda();
IsFirstprivate =
IsFirstprivate || (Stack->mustBeFirstprivate(ClauseKind) && !Res);
if (IsFirstprivate) {
ImplicitFirstprivate.emplace_back(E);
} else {
OpenMPDefaultmapClauseModifier M =
Stack->getDefaultmapModifier(ClauseKind);
OpenMPMapClauseKind Kind = getMapClauseKindFromModifier(
M, ClauseKind == OMPC_DEFAULTMAP_aggregate || Res);
ImplicitMap[Kind].emplace_back(E);
}
return;
}
}
// OpenMP [2.9.3.6, Restrictions, p.2]
// A list item that appears in a reduction clause of the innermost
// enclosing worksharing or parallel construct may not be accessed in an
// explicit task.
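// For example:
//   #pragma omp parallel reduction(+ : x)
//   {
//   #pragma omp task
//     x += 1; // error: 'x' is a reduction list item of the enclosing
//   }         // parallel region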
DVar = Stack->hasInnermostDSA(
VD, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K);
},
/*FromParent=*/true);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
ErrorFound = true;
SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
reportOriginalDsa(SemaRef, Stack, VD, DVar);
return;
}
// Define implicit data-sharing attributes for task.
DVar = Stack->getImplicitDSA(VD, /*FromParent=*/false);
if (((isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared) ||
(Stack->getDefaultDSA() == DSA_firstprivate &&
DVar.CKind == OMPC_firstprivate && !DVar.RefExpr)) &&
!Stack->isLoopControlVariable(VD).first) {
ImplicitFirstprivate.push_back(E);
return;
}
// Store implicitly used globals with declare target link for parent
// target.
if (!isOpenMPTargetExecutionDirective(DKind) && Res &&
*Res == OMPDeclareTargetDeclAttr::MT_Link) {
Stack->addToParentTargetRegionLinkGlobals(E);
return;
}
}
}
void VisitMemberExpr(MemberExpr *E) {
if (E->isTypeDependent() || E->isValueDependent() ||
E->containsUnexpandedParameterPack() || E->isInstantiationDependent())
return;
auto *FD = dyn_cast<FieldDecl>(E->getMemberDecl());
OpenMPDirectiveKind DKind = Stack->getCurrentDirective();
if (auto *TE = dyn_cast<CXXThisExpr>(E->getBase()->IgnoreParenCasts())) {
if (!FD)
return;
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(FD, /*FromParent=*/false);
// Check if the variable has an explicit DSA set and stop the analysis
// if so.
if (DVar.RefExpr || !ImplicitDeclarations.insert(FD).second)
return;
if (isOpenMPTargetExecutionDirective(DKind) &&
!Stack->isLoopControlVariable(FD).first &&
!Stack->checkMappableExprComponentListsForDecl(
FD, /*CurrentRegionOnly=*/true,
[](OMPClauseMappableExprCommon::MappableExprComponentListRef
StackComponents,
OpenMPClauseKind) {
return isa<CXXThisExpr>(
cast<MemberExpr>(
StackComponents.back().getAssociatedExpression())
->getBase()
->IgnoreParens());
})) {
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.3]
// A bit-field cannot appear in a map clause.
//
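// For example, inside a member function of 'struct S { int bits : 3; ... }',
// a reference to 'bits' (i.e. 'this->bits') in a target region is skipped
// here rather than implicitly mapped, because a bit-field cannot appear in
// a map clause.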
if (FD->isBitField())
return;
// Check to see if the member expression is referencing a class that
// has already been explicitly mapped
if (Stack->isClassPreviouslyMapped(TE->getType()))
return;
OpenMPDefaultmapClauseModifier Modifier =
Stack->getDefaultmapModifier(OMPC_DEFAULTMAP_aggregate);
OpenMPMapClauseKind Kind = getMapClauseKindFromModifier(
Modifier, /*IsAggregateOrDeclareTarget*/ true);
ImplicitMap[Kind].emplace_back(E);
return;
}
SourceLocation ELoc = E->getExprLoc();
// OpenMP [2.9.3.6, Restrictions, p.2]
// A list item that appears in a reduction clause of the innermost
// enclosing worksharing or parallel construct may not be accessed in
// an explicit task.
DVar = Stack->hasInnermostDSA(
FD, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K);
},
/*FromParent=*/true);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
ErrorFound = true;
SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
reportOriginalDsa(SemaRef, Stack, FD, DVar);
return;
}
// Define implicit data-sharing attributes for task.
DVar = Stack->getImplicitDSA(FD, /*FromParent=*/false);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared &&
!Stack->isLoopControlVariable(FD).first) {
// Check if there is a captured expression for the current field in the
// region. Do not mark it as firstprivate unless there is no captured
// expression.
// TODO: try to make it firstprivate.
if (DVar.CKind != OMPC_unknown)
ImplicitFirstprivate.push_back(E);
}
return;
}
if (isOpenMPTargetExecutionDirective(DKind)) {
OMPClauseMappableExprCommon::MappableExprComponentList CurComponents;
if (!checkMapClauseExpressionBase(SemaRef, E, CurComponents, OMPC_map,
/*NoDiagnose=*/true))
return;
const auto *VD = cast<ValueDecl>(
CurComponents.back().getAssociatedDeclaration()->getCanonicalDecl());
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[&CurComponents](
OMPClauseMappableExprCommon::MappableExprComponentListRef
StackComponents,
OpenMPClauseKind) {
auto CCI = CurComponents.rbegin();
auto CCE = CurComponents.rend();
for (const auto &SC : llvm::reverse(StackComponents)) {
// Do both expressions have the same kind?
if (CCI->getAssociatedExpression()->getStmtClass() !=
SC.getAssociatedExpression()->getStmtClass())
if (!((isa<OMPArraySectionExpr>(
SC.getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(
SC.getAssociatedExpression())) &&
isa<ArraySubscriptExpr>(
CCI->getAssociatedExpression())))
return false;
const Decl *CCD = CCI->getAssociatedDeclaration();
const Decl *SCD = SC.getAssociatedDeclaration();
CCD = CCD ? CCD->getCanonicalDecl() : nullptr;
SCD = SCD ? SCD->getCanonicalDecl() : nullptr;
if (SCD != CCD)
return false;
std::advance(CCI, 1);
if (CCI == CCE)
break;
}
return true;
})) {
Visit(E->getBase());
}
} else if (!TryCaptureCXXThisMembers) {
Visit(E->getBase());
}
}
void VisitOMPExecutableDirective(OMPExecutableDirective *S) {
for (OMPClause *C : S->clauses()) {
// Skip analysis of arguments of implicitly defined firstprivate clause
// for task|target directives.
// Skip analysis of arguments of implicitly defined map clause for target
// directives.
if (C && !((isa<OMPFirstprivateClause>(C) || isa<OMPMapClause>(C)) &&
C->isImplicit())) {
for (Stmt *CC : C->children()) {
if (CC)
Visit(CC);
}
}
}
// Check implicitly captured variables.
VisitSubCaptures(S);
}
void VisitStmt(Stmt *S) {
for (Stmt *C : S->children()) {
if (C) {
// Check implicitly captured variables in the task-based directives to
// see if they must be firstprivatized.
Visit(C);
}
}
}
void visitSubCaptures(CapturedStmt *S) {
for (const CapturedStmt::Capture &Cap : S->captures()) {
if (!Cap.capturesVariable() && !Cap.capturesVariableByCopy())
continue;
VarDecl *VD = Cap.getCapturedVar();
// Do not try to map the variable if it or its sub-component was mapped
// already.
if (isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[](OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind) { return true; }))
continue;
DeclRefExpr *DRE = buildDeclRefExpr(
SemaRef, VD, VD->getType().getNonLValueExprType(SemaRef.Context),
Cap.getLocation(), /*RefersToCapture=*/true);
Visit(DRE);
}
}
bool isErrorFound() const { return ErrorFound; }
ArrayRef<Expr *> getImplicitFirstprivate() const {
return ImplicitFirstprivate;
}
ArrayRef<Expr *> getImplicitMap(OpenMPDefaultmapClauseKind Kind) const {
return ImplicitMap[Kind];
}
const Sema::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const {
return VarsWithInheritedDSA;
}
DSAAttrChecker(DSAStackTy *S, Sema &SemaRef, CapturedStmt *CS)
: Stack(S), SemaRef(SemaRef), ErrorFound(false), CS(CS) {
// Process declare target link variables for the target directives.
if (isOpenMPTargetExecutionDirective(S->getCurrentDirective())) {
for (DeclRefExpr *E : Stack->getLinkGlobals())
Visit(E);
}
}
};
} // namespace
void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
switch (DKind) {
case OMPD_parallel:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_parallel_sections:
case OMPD_parallel_master:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
break;
}
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
Sema::CapturedParamNameType ParamsTarget[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeamsOrParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'teams' or 'parallel'. Both regions have
// the same implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2);
break;
}
case OMPD_target:
case OMPD_target_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
std::make_pair(StringRef(), QualType()),
/*OpenMPCaptureLevel=*/1);
break;
}
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_target_data: {
Sema::CapturedParamNameType Params[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
break;
}
case OMPD_task: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
break;
}
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd: {
QualType KmpInt32Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1)
.withConst();
QualType KmpUInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0)
.withConst();
QualType KmpInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1)
.withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(".lb.", KmpUInt64Ty),
std::make_pair(".ub.", KmpUInt64Ty),
std::make_pair(".st.", KmpInt64Ty),
std::make_pair(".liter.", KmpInt32Ty),
std::make_pair(".reductions.", VoidPtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
break;
}
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd: {
QualType KmpInt32Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1)
.withConst();
QualType KmpUInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0)
.withConst();
QualType KmpInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1)
.withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'parallel'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/0);
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(".lb.", KmpUInt64Ty),
std::make_pair(".ub.", KmpUInt64Ty),
std::make_pair(".st.", KmpInt64Ty),
std::make_pair(".liter.", KmpInt32Ty),
std::make_pair(".reductions.", VoidPtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/1);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
break;
}
case OMPD_distribute_parallel_for_simd:
case OMPD_distribute_parallel_for: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(".previous.lb.", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
break;
}
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
Sema::CapturedParamNameType ParamsTarget[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeams[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'teams'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeams, /*OpenMPCaptureLevel=*/2);
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(".previous.lb.", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'parallel'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/3);
break;
}
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
Sema::CapturedParamNameType ParamsTeams[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'teams'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeams, /*OpenMPCaptureLevel=*/0);
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(".previous.lb.", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
// Start a captured region for 'parallel'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/1);
break;
}
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(
".copy_fn.",
Context.getPointerType(CopyFnType).withConst().withRestrict()),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
};
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params);
// Mark this captured region as inlined, because we don't use the outlined
// function directly.
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AttributeCommonInfo::AS_Keyword,
AlwaysInlineAttr::Keyword_forceinline));
break;
}
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_cancel:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_requires:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
llvm_unreachable("OpenMP Directive is not allowed");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
}
int Sema::getNumberOfConstructScopes(unsigned Level) const {
return getOpenMPCaptureLevels(DSAStack->getDirective(Level));
}
int Sema::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) {
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DKind);
return CaptureRegions.size();
}
static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id,
Expr *CaptureExpr, bool WithInit,
bool AsExpression) {
assert(CaptureExpr);
ASTContext &C = S.getASTContext();
Expr *Init = AsExpression ? CaptureExpr : CaptureExpr->IgnoreImpCasts();
QualType Ty = Init->getType();
if (CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue()) {
if (S.getLangOpts().CPlusPlus) {
Ty = C.getLValueReferenceType(Ty);
} else {
Ty = C.getPointerType(Ty);
ExprResult Res =
S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_AddrOf, Init);
if (!Res.isUsable())
return nullptr;
Init = Res.get();
}
WithInit = true;
}
auto *CED = OMPCapturedExprDecl::Create(C, S.CurContext, Id, Ty,
CaptureExpr->getBeginLoc());
if (!WithInit)
CED->addAttr(OMPCaptureNoInitAttr::CreateImplicit(C));
S.CurContext->addHiddenDecl(CED);
S.AddInitializerToDecl(CED, Init, /*DirectInit=*/false);
return CED;
}
static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
bool WithInit) {
OMPCapturedExprDecl *CD;
if (VarDecl *VD = S.isOpenMPCapturedDecl(D))
CD = cast<OMPCapturedExprDecl>(VD);
else
CD = buildCaptureDecl(S, D->getIdentifier(), CaptureExpr, WithInit,
/*AsExpression=*/false);
return buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
CaptureExpr->getExprLoc());
}
static ExprResult buildCapture(Sema &S, Expr *CaptureExpr, DeclRefExpr *&Ref) {
CaptureExpr = S.DefaultLvalueConversion(CaptureExpr).get();
if (!Ref) {
OMPCapturedExprDecl *CD = buildCaptureDecl(
S, &S.getASTContext().Idents.get(".capture_expr."), CaptureExpr,
/*WithInit=*/true, /*AsExpression=*/true);
Ref = buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
CaptureExpr->getExprLoc());
}
ExprResult Res = Ref;
if (!S.getLangOpts().CPlusPlus &&
CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue() &&
Ref->getType()->isPointerType()) {
Res = S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_Deref, Ref);
if (!Res.isUsable())
return ExprError();
}
return S.DefaultLvalueConversion(Res.get());
}
namespace {
// OpenMP directives parsed in this section are represented as a
// CapturedStatement with an associated statement. If a syntax error
// is detected during the parsing of the associated statement, the
// compiler must abort processing and close the CapturedStatement.
//
// Combined directives such as 'target parallel' have more than one
// nested CapturedStatements. This RAII ensures that we unwind out
// of all the nested CapturedStatements when an error is found.
class CaptureRegionUnwinderRAII {
private:
Sema &S;
bool &ErrorFound;
OpenMPDirectiveKind DKind = OMPD_unknown;
public:
CaptureRegionUnwinderRAII(Sema &S, bool &ErrorFound,
OpenMPDirectiveKind DKind)
: S(S), ErrorFound(ErrorFound), DKind(DKind) {}
~CaptureRegionUnwinderRAII() {
if (ErrorFound) {
int ThisCaptureLevel = S.getOpenMPCaptureLevels(DKind);
while (--ThisCaptureLevel >= 0)
S.ActOnCapturedRegionError();
}
}
};
} // namespace
void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) {
// Capture variables captured by reference in lambdas for target-based
// directives.
if (!CurContext->isDependentContext() &&
(isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) ||
isOpenMPTargetDataManagementDirective(
DSAStack->getCurrentDirective()))) {
QualType Type = V->getType();
if (const auto *RD = Type.getCanonicalType()
.getNonReferenceType()
->getAsCXXRecordDecl()) {
bool SavedForceCaptureByReferenceInTargetExecutable =
DSAStack->isForceCaptureByReferenceInTargetExecutable();
DSAStack->setForceCaptureByReferenceInTargetExecutable(
/*V=*/true);
if (RD->isLambda()) {
llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
FieldDecl *ThisCapture;
RD->getCaptureFields(Captures, ThisCapture);
for (const LambdaCapture &LC : RD->captures()) {
if (LC.getCaptureKind() == LCK_ByRef) {
VarDecl *VD = LC.getCapturedVar();
DeclContext *VDC = VD->getDeclContext();
if (!VDC->Encloses(CurContext))
continue;
MarkVariableReferenced(LC.getLocation(), VD);
} else if (LC.getCaptureKind() == LCK_This) {
QualType ThisTy = getCurrentThisType();
if (!ThisTy.isNull() &&
Context.typesAreCompatible(ThisTy, ThisCapture->getType()))
CheckCXXThisCapture(LC.getLocation());
}
}
}
DSAStack->setForceCaptureByReferenceInTargetExecutable(
SavedForceCaptureByReferenceInTargetExecutable);
}
}
}
static bool checkOrderedOrderSpecified(Sema &S,
const ArrayRef<OMPClause *> Clauses) {
const OMPOrderedClause *Ordered = nullptr;
const OMPOrderClause *Order = nullptr;
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_ordered)
Ordered = cast<OMPOrderedClause>(Clause);
else if (Clause->getClauseKind() == OMPC_order) {
Order = cast<OMPOrderClause>(Clause);
if (Order->getKind() != OMPC_ORDER_concurrent)
Order = nullptr;
}
if (Ordered && Order)
break;
}
if (Ordered && Order) {
S.Diag(Order->getKindKwLoc(),
diag::err_omp_simple_clause_incompatible_with_ordered)
<< getOpenMPClauseName(OMPC_order)
<< getOpenMPSimpleClauseTypeName(OMPC_order, OMPC_ORDER_concurrent)
<< SourceRange(Order->getBeginLoc(), Order->getEndLoc());
S.Diag(Ordered->getBeginLoc(), diag::note_omp_ordered_param)
<< 0 << SourceRange(Ordered->getBeginLoc(), Ordered->getEndLoc());
return true;
}
return false;
}
StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S,
ArrayRef<OMPClause *> Clauses) {
bool ErrorFound = false;
CaptureRegionUnwinderRAII CaptureRegionUnwinder(
*this, ErrorFound, DSAStack->getCurrentDirective());
if (!S.isUsable()) {
ErrorFound = true;
return StmtError();
}
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
OMPOrderedClause *OC = nullptr;
OMPScheduleClause *SC = nullptr;
SmallVector<const OMPLinearClause *, 4> LCs;
SmallVector<const OMPClauseWithPreInit *, 4> PICs;
// This is required for proper codegen.
for (OMPClause *Clause : Clauses) {
if (!LangOpts.OpenMPSimd &&
isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) &&
Clause->getClauseKind() == OMPC_in_reduction) {
// Capture taskgroup task_reduction descriptors inside the tasking regions
// with the corresponding in_reduction items.
auto *IRC = cast<OMPInReductionClause>(Clause);
for (Expr *E : IRC->taskgroup_descriptors())
if (E)
MarkDeclarationsReferencedInExpr(E);
}
if (isOpenMPPrivate(Clause->getClauseKind()) ||
Clause->getClauseKind() == OMPC_copyprivate ||
(getLangOpts().OpenMPUseTLS &&
getASTContext().getTargetInfo().isTLSSupported() &&
Clause->getClauseKind() == OMPC_copyin)) {
DSAStack->setForceVarCapturing(Clause->getClauseKind() == OMPC_copyin);
// Mark all variables in private list clauses as used in the inner region.
for (Stmt *VarRef : Clause->children()) {
if (auto *E = cast_or_null<Expr>(VarRef)) {
MarkDeclarationsReferencedInExpr(E);
}
}
DSAStack->setForceVarCapturing(/*V=*/false);
} else if (CaptureRegions.size() > 1 ||
CaptureRegions.back() != OMPD_unknown) {
if (auto *C = OMPClauseWithPreInit::get(Clause))
PICs.push_back(C);
if (auto *C = OMPClauseWithPostUpdate::get(Clause)) {
if (Expr *E = C->getPostUpdateExpr())
MarkDeclarationsReferencedInExpr(E);
}
}
if (Clause->getClauseKind() == OMPC_schedule)
SC = cast<OMPScheduleClause>(Clause);
else if (Clause->getClauseKind() == OMPC_ordered)
OC = cast<OMPOrderedClause>(Clause);
else if (Clause->getClauseKind() == OMPC_linear)
LCs.push_back(cast<OMPLinearClause>(Clause));
}
// Capture allocator expressions if used.
for (Expr *E : DSAStack->getInnerAllocators())
MarkDeclarationsReferencedInExpr(E);
// OpenMP, 2.7.1 Loop Construct, Restrictions
// The nonmonotonic modifier cannot be specified if an ordered clause is
// specified.
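// For example:
//   #pragma omp for schedule(nonmonotonic : dynamic) ordered // error
//   for (int i = 0; i < N; ++i)
//     ;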
if (SC &&
(SC->getFirstScheduleModifier() == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
SC->getSecondScheduleModifier() ==
OMPC_SCHEDULE_MODIFIER_nonmonotonic) &&
OC) {
Diag(SC->getFirstScheduleModifier() == OMPC_SCHEDULE_MODIFIER_nonmonotonic
? SC->getFirstScheduleModifierLoc()
: SC->getSecondScheduleModifierLoc(),
diag::err_omp_simple_clause_incompatible_with_ordered)
<< getOpenMPClauseName(OMPC_schedule)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule,
OMPC_SCHEDULE_MODIFIER_nonmonotonic)
<< SourceRange(OC->getBeginLoc(), OC->getEndLoc());
ErrorFound = true;
}
// OpenMP 5.0, 2.9.2 Worksharing-Loop Construct, Restrictions.
// If an order(concurrent) clause is present, an ordered clause may not appear
// on the same directive.
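// For example:
//   #pragma omp for order(concurrent) ordered // error
//   for (int i = 0; i < N; ++i)
//     ;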
if (checkOrderedOrderSpecified(*this, Clauses))
ErrorFound = true;
if (!LCs.empty() && OC && OC->getNumForLoops()) {
for (const OMPLinearClause *C : LCs) {
Diag(C->getBeginLoc(), diag::err_omp_linear_ordered)
<< SourceRange(OC->getBeginLoc(), OC->getEndLoc());
}
ErrorFound = true;
}
if (isOpenMPWorksharingDirective(DSAStack->getCurrentDirective()) &&
isOpenMPSimdDirective(DSAStack->getCurrentDirective()) && OC &&
OC->getNumForLoops()) {
Diag(OC->getBeginLoc(), diag::err_omp_ordered_simd)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
ErrorFound = true;
}
if (ErrorFound) {
return StmtError();
}
StmtResult SR = S;
unsigned CompletedRegions = 0;
for (OpenMPDirectiveKind ThisCaptureRegion : llvm::reverse(CaptureRegions)) {
// Mark all variables in private list clauses as used in the inner region.
// Required for proper codegen of combined directives.
// TODO: add processing for other clauses.
if (ThisCaptureRegion != OMPD_unknown) {
for (const clang::OMPClauseWithPreInit *C : PICs) {
OpenMPDirectiveKind CaptureRegion = C->getCaptureRegion();
// Find the particular capture region for the clause if the
// directive is a combined one with multiple capture regions.
// If the directive is not a combined one, the capture region
// associated with the clause is OMPD_unknown and is generated
// only once.
if (CaptureRegion == ThisCaptureRegion ||
CaptureRegion == OMPD_unknown) {
if (auto *DS = cast_or_null<DeclStmt>(C->getPreInitStmt())) {
for (Decl *D : DS->decls())
MarkVariableReferenced(D->getLocation(), cast<VarDecl>(D));
}
}
}
}
if (ThisCaptureRegion == OMPD_target) {
// Capture allocator traits in the target region. They are used implicitly
// and, thus, are not captured by default.
for (OMPClause *C : Clauses) {
if (const auto *UAC = dyn_cast<OMPUsesAllocatorsClause>(C)) {
for (unsigned I = 0, End = UAC->getNumberOfAllocators(); I < End;
++I) {
OMPUsesAllocatorsClause::Data D = UAC->getAllocatorData(I);
if (Expr *E = D.AllocatorTraits)
MarkDeclarationsReferencedInExpr(E);
}
continue;
}
}
}
if (++CompletedRegions == CaptureRegions.size())
DSAStack->setBodyComplete();
SR = ActOnCapturedRegionEnd(SR.get());
}
return SR;
}
static bool checkCancelRegion(Sema &SemaRef, OpenMPDirectiveKind CurrentRegion,
OpenMPDirectiveKind CancelRegion,
SourceLocation StartLoc) {
// CancelRegion is only needed for cancel and cancellation_point.
if (CurrentRegion != OMPD_cancel && CurrentRegion != OMPD_cancellation_point)
return false;
if (CancelRegion == OMPD_parallel || CancelRegion == OMPD_for ||
CancelRegion == OMPD_sections || CancelRegion == OMPD_taskgroup)
return false;
SemaRef.Diag(StartLoc, diag::err_omp_wrong_cancel_region)
<< getOpenMPDirectiveName(CancelRegion);
return true;
}
static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack,
OpenMPDirectiveKind CurrentRegion,
const DeclarationNameInfo &CurrentName,
OpenMPDirectiveKind CancelRegion,
SourceLocation StartLoc) {
if (Stack->getCurScope()) {
OpenMPDirectiveKind ParentRegion = Stack->getParentDirective();
OpenMPDirectiveKind OffendingRegion = ParentRegion;
bool NestingProhibited = false;
bool CloseNesting = true;
bool OrphanSeen = false;
enum {
NoRecommend,
ShouldBeInParallelRegion,
ShouldBeInOrderedRegion,
ShouldBeInTargetRegion,
ShouldBeInTeamsRegion,
ShouldBeInLoopSimdRegion,
} Recommend = NoRecommend;
if (isOpenMPSimdDirective(ParentRegion) &&
((SemaRef.LangOpts.OpenMP <= 45 && CurrentRegion != OMPD_ordered) ||
(SemaRef.LangOpts.OpenMP >= 50 && CurrentRegion != OMPD_ordered &&
CurrentRegion != OMPD_simd && CurrentRegion != OMPD_atomic &&
CurrentRegion != OMPD_scan))) {
// OpenMP [2.16, Nesting of Regions]
// OpenMP constructs may not be nested inside a simd region.
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP
// construct that can appear in the simd region.
// Allowing a SIMD construct nested in another SIMD construct is an
// extension. The OpenMP 4.5 spec does not allow it. Issue a warning
// message.
// OpenMP 5.0 [2.9.3.1, simd Construct, Restrictions]
// The only OpenMP constructs that can be encountered during execution of
// a simd region are the atomic construct, the loop construct, the simd
// construct and the ordered construct with the simd clause.
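// For example:
//   #pragma omp simd
//   for (int i = 0; i < N; ++i) {
//   #pragma omp parallel // error: not permitted inside a simd region
//     { }
//   }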
SemaRef.Diag(StartLoc, (CurrentRegion != OMPD_simd)
? diag::err_omp_prohibited_region_simd
: diag::warn_omp_nesting_simd)
<< (SemaRef.LangOpts.OpenMP >= 50 ? 1 : 0);
return CurrentRegion != OMPD_simd;
}
if (ParentRegion == OMPD_atomic) {
// OpenMP [2.16, Nesting of Regions]
// OpenMP constructs may not be nested inside an atomic region.
SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region_atomic);
return true;
}
if (CurrentRegion == OMPD_section) {
// OpenMP [2.7.2, sections Construct, Restrictions]
// Orphaned section directives are prohibited. That is, the section
// directives must appear within the sections construct and must not be
// encountered elsewhere in the sections region.
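// For example:
//   #pragma omp parallel
//   {
//   #pragma omp section // error: must be closely nested in a 'sections'
//     { }               // region
//   }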
if (ParentRegion != OMPD_sections &&
ParentRegion != OMPD_parallel_sections) {
SemaRef.Diag(StartLoc, diag::err_omp_orphaned_section_directive)
<< (ParentRegion != OMPD_unknown)
<< getOpenMPDirectiveName(ParentRegion);
return true;
}
return false;
}
// Allow some constructs (except teams and cancellation constructs) to be
// orphaned (they could be used in functions that are called from OpenMP
// regions with the required preconditions).
if (ParentRegion == OMPD_unknown &&
!isOpenMPNestingTeamsDirective(CurrentRegion) &&
CurrentRegion != OMPD_cancellation_point &&
CurrentRegion != OMPD_cancel && CurrentRegion != OMPD_scan)
return false;
if (CurrentRegion == OMPD_cancellation_point ||
CurrentRegion == OMPD_cancel) {
// OpenMP [2.16, Nesting of Regions]
// A cancellation point construct for which construct-type-clause is
// taskgroup must be nested inside a task construct. A cancellation
// point construct for which construct-type-clause is not taskgroup must
// be closely nested inside an OpenMP construct that matches the type
// specified in construct-type-clause.
// A cancel construct for which construct-type-clause is taskgroup must be
// nested inside a task construct. A cancel construct for which
// construct-type-clause is not taskgroup must be closely nested inside an
// OpenMP construct that matches the type specified in
// construct-type-clause.
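// For example:
//   #pragma omp parallel
//   {
//   #pragma omp cancel for // error: construct-type-clause 'for' does not
//   }                      // match the enclosing 'parallel' region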
NestingProhibited =
!((CancelRegion == OMPD_parallel &&
(ParentRegion == OMPD_parallel ||
ParentRegion == OMPD_target_parallel)) ||
(CancelRegion == OMPD_for &&
(ParentRegion == OMPD_for || ParentRegion == OMPD_parallel_for ||
ParentRegion == OMPD_target_parallel_for ||
ParentRegion == OMPD_distribute_parallel_for ||
ParentRegion == OMPD_teams_distribute_parallel_for ||
ParentRegion == OMPD_target_teams_distribute_parallel_for)) ||
(CancelRegion == OMPD_taskgroup &&
(ParentRegion == OMPD_task ||
(SemaRef.getLangOpts().OpenMP >= 50 &&
(ParentRegion == OMPD_taskloop ||
ParentRegion == OMPD_master_taskloop ||
ParentRegion == OMPD_parallel_master_taskloop)))) ||
(CancelRegion == OMPD_sections &&
(ParentRegion == OMPD_section || ParentRegion == OMPD_sections ||
ParentRegion == OMPD_parallel_sections)));
OrphanSeen = ParentRegion == OMPD_unknown;
} else if (CurrentRegion == OMPD_master) {
// OpenMP [2.16, Nesting of Regions]
// A master region may not be closely nested inside a worksharing,
// atomic, or explicit task region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
isOpenMPTaskingDirective(ParentRegion);
} else if (CurrentRegion == OMPD_critical && CurrentName.getName()) {
// OpenMP [2.16, Nesting of Regions]
// A critical region may not be nested (closely or otherwise) inside a
// critical region with the same name. Note that this restriction is not
// sufficient to prevent deadlock.
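// For example:
//   #pragma omp critical(lock)
//   {
//   #pragma omp critical(lock) // error: same name as the enclosing
//     { }                      // critical region
//   }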
SourceLocation PreviousCriticalLoc;
bool DeadLock = Stack->hasDirective(
[CurrentName, &PreviousCriticalLoc](OpenMPDirectiveKind K,
const DeclarationNameInfo &DNI,
SourceLocation Loc) {
if (K == OMPD_critical && DNI.getName() == CurrentName.getName()) {
PreviousCriticalLoc = Loc;
return true;
}
return false;
},
false /* skip top directive */);
if (DeadLock) {
SemaRef.Diag(StartLoc,
diag::err_omp_prohibited_region_critical_same_name)
<< CurrentName.getName();
if (PreviousCriticalLoc.isValid())
SemaRef.Diag(PreviousCriticalLoc,
diag::note_omp_previous_critical_region);
return true;
}
} else if (CurrentRegion == OMPD_barrier) {
// OpenMP [2.16, Nesting of Regions]
// A barrier region may not be closely nested inside a worksharing,
// explicit task, critical, ordered, atomic, or master region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
isOpenMPTaskingDirective(ParentRegion) ||
ParentRegion == OMPD_master ||
ParentRegion == OMPD_parallel_master ||
ParentRegion == OMPD_critical ||
ParentRegion == OMPD_ordered;
} else if (isOpenMPWorksharingDirective(CurrentRegion) &&
!isOpenMPParallelDirective(CurrentRegion) &&
!isOpenMPTeamsDirective(CurrentRegion)) {
// OpenMP [2.16, Nesting of Regions]
// A worksharing region may not be closely nested inside a worksharing,
// explicit task, critical, ordered, atomic, or master region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
isOpenMPTaskingDirective(ParentRegion) ||
ParentRegion == OMPD_master ||
ParentRegion == OMPD_parallel_master ||
ParentRegion == OMPD_critical ||
ParentRegion == OMPD_ordered;
Recommend = ShouldBeInParallelRegion;
} else if (CurrentRegion == OMPD_ordered) {
// OpenMP [2.16, Nesting of Regions]
// An ordered region may not be closely nested inside a critical,
// atomic, or explicit task region.
// An ordered region must be closely nested inside a loop region (or
// parallel loop region) with an ordered clause.
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP construct
// that can appear in the simd region.
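// For example, '#pragma omp ordered' below is valid only because the
// enclosing loop directive carries an 'ordered' clause:
//   #pragma omp for ordered
//   for (int i = 0; i < N; ++i) {
//   #pragma omp ordered
//     { }
//   }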
NestingProhibited = ParentRegion == OMPD_critical ||
isOpenMPTaskingDirective(ParentRegion) ||
!(isOpenMPSimdDirective(ParentRegion) ||
Stack->isParentOrderedRegion());
Recommend = ShouldBeInOrderedRegion;
} else if (isOpenMPNestingTeamsDirective(CurrentRegion)) {
// OpenMP [2.16, Nesting of Regions]
// If specified, a teams construct must be contained within a target
// construct.
NestingProhibited =
(SemaRef.LangOpts.OpenMP <= 45 && ParentRegion != OMPD_target) ||
(SemaRef.LangOpts.OpenMP >= 50 && ParentRegion != OMPD_unknown &&
ParentRegion != OMPD_target);
OrphanSeen = ParentRegion == OMPD_unknown;
Recommend = ShouldBeInTargetRegion;
} else if (CurrentRegion == OMPD_scan) {
// OpenMP 5.0 [2.9.6, scan Directive, Restrictions]
// A scan directive must be closely nested inside a simd, for, for simd,
// parallel for, or parallel for simd region.
NestingProhibited =
SemaRef.LangOpts.OpenMP < 50 ||
(ParentRegion != OMPD_simd && ParentRegion != OMPD_for &&
ParentRegion != OMPD_for_simd && ParentRegion != OMPD_parallel_for &&
ParentRegion != OMPD_parallel_for_simd);
OrphanSeen = ParentRegion == OMPD_unknown;
Recommend = ShouldBeInLoopSimdRegion;
}
if (!NestingProhibited &&
!isOpenMPTargetExecutionDirective(CurrentRegion) &&
!isOpenMPTargetDataManagementDirective(CurrentRegion) &&
(ParentRegion == OMPD_teams || ParentRegion == OMPD_target_teams)) {
// OpenMP [2.16, Nesting of Regions]
// distribute, parallel, parallel sections, parallel workshare, and the
// parallel loop and parallel loop SIMD constructs are the only OpenMP
// constructs that can be closely nested in the teams region.
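// For example:
//   #pragma omp teams
//   {
//   #pragma omp single // error: only 'distribute' and parallel-based
//     { }              // constructs may be closely nested in 'teams'
//   }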
NestingProhibited = !isOpenMPParallelDirective(CurrentRegion) &&
!isOpenMPDistributeDirective(CurrentRegion);
Recommend = ShouldBeInParallelRegion;
}
if (!NestingProhibited &&
isOpenMPNestingDistributeDirective(CurrentRegion)) {
// OpenMP 4.5 [2.17 Nesting of Regions]
// The region associated with the distribute construct must be strictly
// nested inside a teams region
NestingProhibited =
(ParentRegion != OMPD_teams && ParentRegion != OMPD_target_teams);
Recommend = ShouldBeInTeamsRegion;
}
if (!NestingProhibited &&
(isOpenMPTargetExecutionDirective(CurrentRegion) ||
isOpenMPTargetDataManagementDirective(CurrentRegion))) {
// OpenMP 4.5 [2.17 Nesting of Regions]
// If a target, target update, target data, target enter data, or
// target exit data construct is encountered during execution of a
// target region, the behavior is unspecified.
NestingProhibited = Stack->hasDirective(
[&OffendingRegion](OpenMPDirectiveKind K, const DeclarationNameInfo &,
SourceLocation) {
if (isOpenMPTargetExecutionDirective(K)) {
OffendingRegion = K;
return true;
}
return false;
},
false /* don't skip top directive */);
CloseNesting = false;
}
if (NestingProhibited) {
if (OrphanSeen) {
SemaRef.Diag(StartLoc, diag::err_omp_orphaned_device_directive)
<< getOpenMPDirectiveName(CurrentRegion) << Recommend;
} else {
SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region)
<< CloseNesting << getOpenMPDirectiveName(OffendingRegion)
<< Recommend << getOpenMPDirectiveName(CurrentRegion);
}
return true;
}
}
return false;
}
struct Kind2Unsigned {
using argument_type = OpenMPDirectiveKind;
unsigned operator()(argument_type DK) { return unsigned(DK); }
};
static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind,
ArrayRef<OMPClause *> Clauses,
ArrayRef<OpenMPDirectiveKind> AllowedNameModifiers) {
bool ErrorFound = false;
unsigned NamedModifiersNumber = 0;
llvm::IndexedMap<const OMPIfClause *, Kind2Unsigned> FoundNameModifiers;
FoundNameModifiers.resize(llvm::omp::Directive_enumSize + 1);
SmallVector<SourceLocation, 4> NameModifierLoc;
for (const OMPClause *C : Clauses) {
if (const auto *IC = dyn_cast_or_null<OMPIfClause>(C)) {
// At most one if clause without a directive-name-modifier can appear on
// the directive.
OpenMPDirectiveKind CurNM = IC->getNameModifier();
if (FoundNameModifiers[CurNM]) {
S.Diag(C->getBeginLoc(), diag::err_omp_more_one_clause)
<< getOpenMPDirectiveName(Kind) << getOpenMPClauseName(OMPC_if)
<< (CurNM != OMPD_unknown) << getOpenMPDirectiveName(CurNM);
ErrorFound = true;
} else if (CurNM != OMPD_unknown) {
NameModifierLoc.push_back(IC->getNameModifierLoc());
++NamedModifiersNumber;
}
FoundNameModifiers[CurNM] = IC;
if (CurNM == OMPD_unknown)
continue;
// Check if the specified name modifier is allowed for the current
// directive.
// At most one if clause with the particular directive-name-modifier can
// appear on the directive.
bool MatchFound = false;
for (auto NM : AllowedNameModifiers) {
if (CurNM == NM) {
MatchFound = true;
break;
}
}
if (!MatchFound) {
S.Diag(IC->getNameModifierLoc(),
diag::err_omp_wrong_if_directive_name_modifier)
<< getOpenMPDirectiveName(CurNM) << getOpenMPDirectiveName(Kind);
ErrorFound = true;
}
}
}
// If any if clause on the directive includes a directive-name-modifier then
// all if clauses on the directive must include a directive-name-modifier.
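// For example, on a combined 'parallel for' directive:
//   #pragma omp parallel for if(parallel : c1) if(c2) // error: the second
//   for (int i = 0; i < N; ++i)                       // 'if' clause needs a
//     ;                                               // directive-name-modifier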
if (FoundNameModifiers[OMPD_unknown] && NamedModifiersNumber > 0) {
if (NamedModifiersNumber == AllowedNameModifiers.size()) {
S.Diag(FoundNameModifiers[OMPD_unknown]->getBeginLoc(),
diag::err_omp_no_more_if_clause);
} else {
std::string Values;
std::string Sep(", ");
unsigned AllowedCnt = 0;
unsigned TotalAllowedNum =
AllowedNameModifiers.size() - NamedModifiersNumber;
for (unsigned Cnt = 0, End = AllowedNameModifiers.size(); Cnt < End;
++Cnt) {
OpenMPDirectiveKind NM = AllowedNameModifiers[Cnt];
if (!FoundNameModifiers[NM]) {
Values += "'";
Values += getOpenMPDirectiveName(NM);
Values += "'";
if (AllowedCnt + 2 == TotalAllowedNum)
Values += " or ";
else if (AllowedCnt + 1 != TotalAllowedNum)
Values += Sep;
++AllowedCnt;
}
}
S.Diag(FoundNameModifiers[OMPD_unknown]->getCondition()->getBeginLoc(),
diag::err_omp_unnamed_if_clause)
<< (TotalAllowedNum > 1) << Values;
}
for (SourceLocation Loc : NameModifierLoc) {
S.Diag(Loc, diag::note_omp_previous_named_if_clause);
}
ErrorFound = true;
}
return ErrorFound;
}
static std::pair<ValueDecl *, bool> getPrivateItem(Sema &S, Expr *&RefExpr,
SourceLocation &ELoc,
SourceRange &ERange,
bool AllowArraySection) {
if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
RefExpr->containsUnexpandedParameterPack())
return std::make_pair(nullptr, true);
// OpenMP [3.1, C/C++]
// A list item is a variable name.
// OpenMP [2.9.3.3, Restrictions, p.1]
// A variable that is part of another variable (as an array or
// structure element) cannot appear in a private clause.
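// For example:
//   struct S { int a; } s;
//   #pragma omp parallel private(s.a) // error: 's.a' is part of 's'
//   { }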
RefExpr = RefExpr->IgnoreParens();
enum {
NoArrayExpr = -1,
ArraySubscript = 0,
OMPArraySection = 1
} IsArrayExpr = NoArrayExpr;
if (AllowArraySection) {
if (auto *ASE = dyn_cast_or_null<ArraySubscriptExpr>(RefExpr)) {
Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
IsArrayExpr = ArraySubscript;
} else if (auto *OASE = dyn_cast_or_null<OMPArraySectionExpr>(RefExpr)) {
Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
Base = TempOASE->getBase()->IgnoreParenImpCasts();
while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
IsArrayExpr = OMPArraySection;
}
}
ELoc = RefExpr->getExprLoc();
ERange = RefExpr->getSourceRange();
RefExpr = RefExpr->IgnoreParenImpCasts();
auto *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
auto *ME = dyn_cast_or_null<MemberExpr>(RefExpr);
if ((!DE || !isa<VarDecl>(DE->getDecl())) &&
(S.getCurrentThisType().isNull() || !ME ||
!isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()) ||
!isa<FieldDecl>(ME->getMemberDecl()))) {
if (IsArrayExpr != NoArrayExpr) {
S.Diag(ELoc, diag::err_omp_expected_base_var_name) << IsArrayExpr
<< ERange;
} else {
S.Diag(ELoc,
AllowArraySection
? diag::err_omp_expected_var_name_member_expr_or_array_item
: diag::err_omp_expected_var_name_member_expr)
<< (S.getCurrentThisType().isNull() ? 0 : 1) << ERange;
}
return std::make_pair(nullptr, false);
}
return std::make_pair(
getCanonicalDecl(DE ? DE->getDecl() : ME->getMemberDecl()), false);
}
namespace {
/// Checks if the allocator appears in a uses_allocators clause, which is
/// required for it to be allowed in target regions.
class AllocatorChecker final : public ConstStmtVisitor<AllocatorChecker, bool> {
DSAStackTy *S = nullptr;
public:
bool VisitDeclRefExpr(const DeclRefExpr *E) {
return S->isUsesAllocatorsDecl(E->getDecl())
.getValueOr(
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait) ==
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait;
}
bool VisitStmt(const Stmt *S) {
for (const Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
}
return false;
}
explicit AllocatorChecker(DSAStackTy *S) : S(S) {}
};
} // namespace
static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses) {
assert(!S.CurContext->isDependentContext() &&
"Expected non-dependent context.");
auto AllocateRange =
llvm::make_filter_range(Clauses, OMPAllocateClause::classof);
llvm::DenseMap<CanonicalDeclPtr<Decl>, CanonicalDeclPtr<VarDecl>>
DeclToCopy;
auto PrivateRange = llvm::make_filter_range(Clauses, [](const OMPClause *C) {
return isOpenMPPrivate(C->getClauseKind());
});
for (OMPClause *Cl : PrivateRange) {
MutableArrayRef<Expr *>::iterator I, It, Et;
if (Cl->getClauseKind() == OMPC_private) {
auto *PC = cast<OMPPrivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_firstprivate) {
auto *PC = cast<OMPFirstprivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_lastprivate) {
auto *PC = cast<OMPLastprivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_linear) {
auto *PC = cast<OMPLinearClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_reduction) {
auto *PC = cast<OMPReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_task_reduction) {
auto *PC = cast<OMPTaskReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_in_reduction) {
auto *PC = cast<OMPInReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else {
llvm_unreachable("Expected private clause.");
}
for (Expr *E : llvm::make_range(It, Et)) {
if (!*I) {
++I;
continue;
}
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = E;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
DeclToCopy.try_emplace(Res.first,
cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()));
++I;
}
}
for (OMPClause *C : AllocateRange) {
auto *AC = cast<OMPAllocateClause>(C);
if (S.getLangOpts().OpenMP >= 50 &&
!Stack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>() &&
isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
AC->getAllocator()) {
Expr *Allocator = AC->getAllocator();
// OpenMP, 2.12.5 target Construct
// Memory allocators that do not appear in a uses_allocators clause cannot
// appear as an allocator in an allocate clause or be used in the target
// region unless a requires directive with the dynamic_allocators clause
// is present in the same compilation unit.
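// Illustrative example (hypothetical user code; 'my_alloc' stands for an
// allocator handle not listed in any uses_allocators clause): the following
// is diagnosed here unless '#pragma omp requires dynamic_allocators' appears
// in the same compilation unit:
//   #pragma omp target firstprivate(x) allocate(my_alloc: x)
//   { ... }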
AllocatorChecker Checker(Stack);
if (Checker.Visit(Allocator))
S.Diag(Allocator->getExprLoc(),
diag::err_omp_allocator_not_in_uses_allocators)
<< Allocator->getSourceRange();
}
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind =
getAllocatorKind(S, Stack, AC->getAllocator());
// OpenMP, 2.11.4 allocate Clause, Restrictions.
// For task, taskloop or target directives, allocation requests to memory
// allocators with the trait access set to thread result in unspecified
// behavior.
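// Illustrative example (hypothetical user code): the predefined
// 'omp_thread_mem_alloc' allocator triggers the warning below when used on a
// task directive:
//   #pragma omp task firstprivate(x) allocate(omp_thread_mem_alloc: x)
//   { ... }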
if (AllocatorKind == OMPAllocateDeclAttr::OMPThreadMemAlloc &&
(isOpenMPTaskingDirective(Stack->getCurrentDirective()) ||
isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()))) {
S.Diag(AC->getAllocator()->getExprLoc(),
diag::warn_omp_allocate_thread_on_task_target_directive)
<< getOpenMPDirectiveName(Stack->getCurrentDirective());
}
for (Expr *E : AC->varlists()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = E;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange);
ValueDecl *VD = Res.first;
DSAStackTy::DSAVarData Data = Stack->getTopDSA(VD, /*FromParent=*/false);
if (!isOpenMPPrivate(Data.CKind)) {
S.Diag(E->getExprLoc(),
diag::err_omp_expected_private_copy_for_allocate);
continue;
}
VarDecl *PrivateVD = DeclToCopy[VD];
if (checkPreviousOMPAllocateAttribute(S, Stack, E, PrivateVD,
AllocatorKind, AC->getAllocator()))
continue;
applyOMPAllocateAttribute(S, PrivateVD, AllocatorKind, AC->getAllocator(),
E->getSourceRange());
}
}
}
StmtResult Sema::ActOnOpenMPExecutableDirective(
OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) {
StmtResult Res = StmtError();
// First check CancelRegion which is then used in checkNestingOfRegions.
if (checkCancelRegion(*this, Kind, CancelRegion, StartLoc) ||
checkNestingOfRegions(*this, DSAStack, Kind, DirName, CancelRegion,
StartLoc))
return StmtError();
llvm::SmallVector<OMPClause *, 8> ClausesWithImplicit;
VarsWithInheritedDSAType VarsWithInheritedDSA;
bool ErrorFound = false;
ClausesWithImplicit.append(Clauses.begin(), Clauses.end());
if (AStmt && !CurContext->isDependentContext()) {
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// Check default data sharing attributes for referenced variables.
DSAAttrChecker DSAChecker(DSAStack, *this, cast<CapturedStmt>(AStmt));
int ThisCaptureLevel = getOpenMPCaptureLevels(Kind);
Stmt *S = AStmt;
while (--ThisCaptureLevel >= 0)
S = cast<CapturedStmt>(S)->getCapturedStmt();
DSAChecker.Visit(S);
if (!isOpenMPTargetDataManagementDirective(Kind) &&
!isOpenMPTaskingDirective(Kind)) {
// Visit subcaptures to generate implicit clauses for captured vars.
auto *CS = cast<CapturedStmt>(AStmt);
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, Kind);
// Ignore outer tasking regions for target directives.
if (CaptureRegions.size() > 1 && CaptureRegions.front() == OMPD_task)
CS = cast<CapturedStmt>(CS->getCapturedStmt());
DSAChecker.visitSubCaptures(CS);
}
if (DSAChecker.isErrorFound())
return StmtError();
// Generate list of implicitly defined firstprivate variables.
VarsWithInheritedDSA = DSAChecker.getVarsWithInheritedDSA();
SmallVector<Expr *, 4> ImplicitFirstprivates(
DSAChecker.getImplicitFirstprivate().begin(),
DSAChecker.getImplicitFirstprivate().end());
SmallVector<Expr *, 4> ImplicitMaps[OMPC_MAP_delete];
for (unsigned I = 0; I < OMPC_MAP_delete; ++I) {
ArrayRef<Expr *> ImplicitMap =
DSAChecker.getImplicitMap(static_cast<OpenMPDefaultmapClauseKind>(I));
ImplicitMaps[I].append(ImplicitMap.begin(), ImplicitMap.end());
}
// Mark taskgroup task_reduction descriptors as implicitly firstprivate.
for (OMPClause *C : Clauses) {
if (auto *IRC = dyn_cast<OMPInReductionClause>(C)) {
for (Expr *E : IRC->taskgroup_descriptors())
if (E)
ImplicitFirstprivates.emplace_back(E);
}
// OpenMP 5.0, 2.10.1 task Construct
// [detach clause]... The event-handle will be considered as if it was
// specified on a firstprivate clause.
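// Illustrative example (hypothetical user code):
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)   // 'ev' is treated as firstprivate
//   { ... }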
if (auto *DC = dyn_cast<OMPDetachClause>(C))
ImplicitFirstprivates.push_back(DC->getEventHandler());
}
if (!ImplicitFirstprivates.empty()) {
if (OMPClause *Implicit = ActOnOpenMPFirstprivateClause(
ImplicitFirstprivates, SourceLocation(), SourceLocation(),
SourceLocation())) {
ClausesWithImplicit.push_back(Implicit);
ErrorFound = cast<OMPFirstprivateClause>(Implicit)->varlist_size() !=
ImplicitFirstprivates.size();
} else {
ErrorFound = true;
}
}
int ClauseKindCnt = -1;
for (ArrayRef<Expr *> ImplicitMap : ImplicitMaps) {
++ClauseKindCnt;
if (ImplicitMap.empty())
continue;
CXXScopeSpec MapperIdScopeSpec;
DeclarationNameInfo MapperId;
auto Kind = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
if (OMPClause *Implicit = ActOnOpenMPMapClause(
llvm::None, llvm::None, MapperIdScopeSpec, MapperId, Kind,
/*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(),
ImplicitMap, OMPVarListLocTy())) {
ClausesWithImplicit.emplace_back(Implicit);
ErrorFound |=
cast<OMPMapClause>(Implicit)->varlist_size() != ImplicitMap.size();
} else {
ErrorFound = true;
}
}
}
llvm::SmallVector<OpenMPDirectiveKind, 4> AllowedNameModifiers;
switch (Kind) {
case OMPD_parallel:
Res = ActOnOpenMPParallelDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_simd:
Res = ActOnOpenMPSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_for:
Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
VarsWithInheritedDSA);
break;
case OMPD_for_simd:
Res = ActOnOpenMPForSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_sections:
Res = ActOnOpenMPSectionsDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
case OMPD_section:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp section' directive");
Res = ActOnOpenMPSectionDirective(AStmt, StartLoc, EndLoc);
break;
case OMPD_single:
Res = ActOnOpenMPSingleDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
case OMPD_master:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp master' directive");
Res = ActOnOpenMPMasterDirective(AStmt, StartLoc, EndLoc);
break;
case OMPD_critical:
Res = ActOnOpenMPCriticalDirective(DirName, ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
break;
case OMPD_parallel_for:
Res = ActOnOpenMPParallelForDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_parallel_for_simd:
Res = ActOnOpenMPParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_parallel_master:
Res = ActOnOpenMPParallelMasterDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_parallel_sections:
Res = ActOnOpenMPParallelSectionsDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_task:
Res =
ActOnOpenMPTaskDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
AllowedNameModifiers.push_back(OMPD_task);
break;
case OMPD_taskyield:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp taskyield' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp taskyield' directive");
Res = ActOnOpenMPTaskyieldDirective(StartLoc, EndLoc);
break;
case OMPD_barrier:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp barrier' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp barrier' directive");
Res = ActOnOpenMPBarrierDirective(StartLoc, EndLoc);
break;
case OMPD_taskwait:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp taskwait' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp taskwait' directive");
Res = ActOnOpenMPTaskwaitDirective(StartLoc, EndLoc);
break;
case OMPD_taskgroup:
Res = ActOnOpenMPTaskgroupDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
case OMPD_flush:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp flush' directive");
Res = ActOnOpenMPFlushDirective(ClausesWithImplicit, StartLoc, EndLoc);
break;
case OMPD_depobj:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp depobj' directive");
Res = ActOnOpenMPDepobjDirective(ClausesWithImplicit, StartLoc, EndLoc);
break;
case OMPD_scan:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp scan' directive");
Res = ActOnOpenMPScanDirective(ClausesWithImplicit, StartLoc, EndLoc);
break;
case OMPD_ordered:
Res = ActOnOpenMPOrderedDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
case OMPD_atomic:
Res = ActOnOpenMPAtomicDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
case OMPD_teams:
Res =
ActOnOpenMPTeamsDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
break;
case OMPD_target:
Res = ActOnOpenMPTargetDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
AllowedNameModifiers.push_back(OMPD_target);
break;
case OMPD_target_parallel:
Res = ActOnOpenMPTargetParallelDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
AllowedNameModifiers.push_back(OMPD_target);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_target_parallel_for:
Res = ActOnOpenMPTargetParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_cancellation_point:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp cancellation point' directive");
assert(AStmt == nullptr && "No associated statement allowed for 'omp "
"cancellation point' directive");
Res = ActOnOpenMPCancellationPointDirective(StartLoc, EndLoc, CancelRegion);
break;
case OMPD_cancel:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp cancel' directive");
Res = ActOnOpenMPCancelDirective(ClausesWithImplicit, StartLoc, EndLoc,
CancelRegion);
AllowedNameModifiers.push_back(OMPD_cancel);
break;
case OMPD_target_data:
Res = ActOnOpenMPTargetDataDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
AllowedNameModifiers.push_back(OMPD_target_data);
break;
case OMPD_target_enter_data:
Res = ActOnOpenMPTargetEnterDataDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
AllowedNameModifiers.push_back(OMPD_target_enter_data);
break;
case OMPD_target_exit_data:
Res = ActOnOpenMPTargetExitDataDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
AllowedNameModifiers.push_back(OMPD_target_exit_data);
break;
case OMPD_taskloop:
Res = ActOnOpenMPTaskLoopDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
break;
case OMPD_taskloop_simd:
Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_master_taskloop:
Res = ActOnOpenMPMasterTaskLoopDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
break;
case OMPD_master_taskloop_simd:
Res = ActOnOpenMPMasterTaskLoopSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_parallel_master_taskloop:
Res = ActOnOpenMPParallelMasterTaskLoopDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_parallel_master_taskloop_simd:
Res = ActOnOpenMPParallelMasterTaskLoopSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_taskloop);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_distribute:
Res = ActOnOpenMPDistributeDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
break;
case OMPD_target_update:
Res = ActOnOpenMPTargetUpdateDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
AllowedNameModifiers.push_back(OMPD_target_update);
break;
case OMPD_distribute_parallel_for:
Res = ActOnOpenMPDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_distribute_parallel_for_simd:
Res = ActOnOpenMPDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_distribute_simd:
Res = ActOnOpenMPDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_target_parallel_for_simd:
Res = ActOnOpenMPTargetParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_target_simd:
Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_teams_distribute:
Res = ActOnOpenMPTeamsDistributeDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
break;
case OMPD_teams_distribute_simd:
Res = ActOnOpenMPTeamsDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_teams_distribute_parallel_for_simd:
Res = ActOnOpenMPTeamsDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_teams_distribute_parallel_for:
Res = ActOnOpenMPTeamsDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_target_teams:
Res = ActOnOpenMPTargetTeamsDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
AllowedNameModifiers.push_back(OMPD_target);
break;
case OMPD_target_teams_distribute:
Res = ActOnOpenMPTargetTeamsDistributeDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
break;
case OMPD_target_teams_distribute_parallel_for:
Res = ActOnOpenMPTargetTeamsDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
AllowedNameModifiers.push_back(OMPD_parallel);
break;
case OMPD_target_teams_distribute_parallel_for_simd:
Res = ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
AllowedNameModifiers.push_back(OMPD_parallel);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_target_teams_distribute_simd:
Res = ActOnOpenMPTargetTeamsDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
AllowedNameModifiers.push_back(OMPD_target);
if (LangOpts.OpenMP >= 50)
AllowedNameModifiers.push_back(OMPD_simd);
break;
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_requires:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
llvm_unreachable("OpenMP Directive is not allowed");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
ErrorFound = Res.isInvalid() || ErrorFound;
// Check variables in the clauses if default(none) or
// default(firstprivate) was specified.
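// Illustrative example (hypothetical user code): with default(none), a
// variable referenced in one of the clauses analyzed below needs an explicit
// data-sharing attribute:
//   int chunk = 4;
//   #pragma omp parallel for default(none) schedule(static, chunk)
//   for (int i = 0; i < 8; ++i)       // error: 'chunk' must have explicitly
//     ;                               // specified data sharing attributes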
if (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate) {
DSAAttrChecker DSAChecker(DSAStack, *this, nullptr);
for (OMPClause *C : Clauses) {
switch (C->getClauseKind()) {
case OMPC_num_threads:
case OMPC_dist_schedule:
// Do not analyze if no parent teams directive.
if (isOpenMPTeamsDirective(Kind))
break;
continue;
case OMPC_if:
if (isOpenMPTeamsDirective(Kind) &&
cast<OMPIfClause>(C)->getNameModifier() != OMPD_target)
break;
if (isOpenMPParallelDirective(Kind) &&
isOpenMPTaskLoopDirective(Kind) &&
cast<OMPIfClause>(C)->getNameModifier() != OMPD_parallel)
break;
continue;
case OMPC_schedule:
case OMPC_detach:
break;
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_final:
case OMPC_priority:
// Do not analyze if no parent parallel directive.
if (isOpenMPParallelDirective(Kind))
break;
continue;
case OMPC_ordered:
case OMPC_device:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_hint:
case OMPC_collapse:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_allocate:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_nogroup:
case OMPC_defaultmap:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
continue;
case OMPC_allocator:
case OMPC_flush:
case OMPC_depobj:
case OMPC_threadprivate:
case OMPC_uniform:
case OMPC_unknown:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
default:
llvm_unreachable("Unexpected clause");
}
for (Stmt *CC : C->children()) {
if (CC)
DSAChecker.Visit(CC);
}
}
for (const auto &P : DSAChecker.getVarsWithInheritedDSA())
VarsWithInheritedDSA[P.getFirst()] = P.getSecond();
}
for (const auto &P : VarsWithInheritedDSA) {
if (P.getFirst()->isImplicit() || isa<OMPCapturedExprDecl>(P.getFirst()))
continue;
ErrorFound = true;
if (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate) {
Diag(P.second->getExprLoc(), diag::err_omp_no_dsa_for_variable)
<< P.first << P.second->getSourceRange();
Diag(DSAStack->getDefaultDSALocation(), diag::note_omp_default_dsa_none);
} else if (getLangOpts().OpenMP >= 50) {
Diag(P.second->getExprLoc(),
diag::err_omp_defaultmap_no_attr_for_variable)
<< P.first << P.second->getSourceRange();
Diag(DSAStack->getDefaultDSALocation(),
diag::note_omp_defaultmap_attr_none);
}
}
if (!AllowedNameModifiers.empty())
ErrorFound = checkIfClauses(*this, Kind, Clauses, AllowedNameModifiers) ||
ErrorFound;
if (ErrorFound)
return StmtError();
if (!CurContext->isDependentContext() &&
isOpenMPTargetExecutionDirective(Kind) &&
!(DSAStack->hasRequiresDeclWithClause<OMPUnifiedSharedMemoryClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPUnifiedAddressClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPReverseOffloadClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>())) {
// Register target to DSA Stack.
DSAStack->addTargetDirLocation(StartLoc);
}
return Res;
}
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareSimdDirective(
DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, Expr *Simdlen,
ArrayRef<Expr *> Uniforms, ArrayRef<Expr *> Aligneds,
ArrayRef<Expr *> Alignments, ArrayRef<Expr *> Linears,
ArrayRef<unsigned> LinModifiers, ArrayRef<Expr *> Steps, SourceRange SR) {
assert(Aligneds.size() == Alignments.size());
assert(Linears.size() == LinModifiers.size());
assert(Linears.size() == Steps.size());
if (!DG || DG.get().isNull())
return DeclGroupPtrTy();
const int SimdId = 0;
if (!DG.get().isSingleDecl()) {
Diag(SR.getBegin(), diag::err_omp_single_decl_in_declare_simd_variant)
<< SimdId;
return DG;
}
Decl *ADecl = DG.get().getSingleDecl();
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(ADecl))
ADecl = FTD->getTemplatedDecl();
auto *FD = dyn_cast<FunctionDecl>(ADecl);
if (!FD) {
Diag(ADecl->getLocation(), diag::err_omp_function_expected) << SimdId;
return DeclGroupPtrTy();
}
// OpenMP [2.8.2, declare simd construct, Description]
// The parameter of the simdlen clause must be a constant positive integer
// expression.
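// Illustrative example (hypothetical function):
//   #pragma omp declare simd simdlen(8)
//   float foo(float x);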
ExprResult SL;
if (Simdlen)
SL = VerifyPositiveIntegerConstantInClause(Simdlen, OMPC_simdlen);
// OpenMP [2.8.2, declare simd construct, Description]
// The special this pointer can be used as if it was one of the arguments to
// the function in any of the linear, aligned, or uniform clauses.
// The uniform clause declares one or more arguments to have an invariant
// value for all concurrent invocations of the function in the execution of a
// single SIMD loop.
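// Illustrative example (hypothetical function):
//   #pragma omp declare simd uniform(n)
//   void foo(float *p, int n);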
llvm::DenseMap<const Decl *, const Expr *> UniformedArgs;
const Expr *UniformedLinearThis = nullptr;
for (const Expr *E : Uniforms) {
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
FD->getParamDecl(PVD->getFunctionScopeIndex())
->getCanonicalDecl() == PVD->getCanonicalDecl()) {
UniformedArgs.try_emplace(PVD->getCanonicalDecl(), E);
continue;
}
if (isa<CXXThisExpr>(E)) {
UniformedLinearThis = E;
continue;
}
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
}
// OpenMP [2.8.2, declare simd construct, Description]
// The aligned clause declares that the object to which each list item points
// is aligned to the number of bytes expressed in the optional parameter of
// the aligned clause.
// The special this pointer can be used as if it was one of the arguments to
// the function in any of the linear, aligned, or uniform clauses.
// The type of list items appearing in the aligned clause must be array,
// pointer, reference to array, or reference to pointer.
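// Illustrative example (hypothetical function):
//   #pragma omp declare simd aligned(p : 64)
//   void foo(float *p, int n);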
llvm::DenseMap<const Decl *, const Expr *> AlignedArgs;
const Expr *AlignedThis = nullptr;
for (const Expr *E : Aligneds) {
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
FD->getParamDecl(PVD->getFunctionScopeIndex())
->getCanonicalDecl() == CanonPVD) {
// OpenMP [2.8.1, simd construct, Restrictions]
// A list-item cannot appear in more than one aligned clause.
if (AlignedArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_used_in_clause_twice)
<< 1 << getOpenMPClauseName(OMPC_aligned)
<< E->getSourceRange();
Diag(AlignedArgs[CanonPVD]->getExprLoc(),
diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_aligned);
continue;
}
AlignedArgs[CanonPVD] = E;
QualType QTy = PVD->getType()
.getNonReferenceType()
.getUnqualifiedType()
.getCanonicalType();
const Type *Ty = QTy.getTypePtrOrNull();
if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
Diag(E->getExprLoc(), diag::err_omp_aligned_expected_array_or_ptr)
<< QTy << getLangOpts().CPlusPlus << E->getSourceRange();
Diag(PVD->getLocation(), diag::note_previous_decl) << PVD;
}
continue;
}
}
if (isa<CXXThisExpr>(E)) {
if (AlignedThis) {
Diag(E->getExprLoc(), diag::err_omp_used_in_clause_twice)
<< 2 << getOpenMPClauseName(OMPC_aligned) << E->getSourceRange();
Diag(AlignedThis->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_aligned);
}
AlignedThis = E;
continue;
}
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
}
// The optional parameter of the aligned clause, alignment, must be a constant
// positive integer expression. If no optional parameter is specified,
// implementation-defined default alignments for SIMD instructions on the
// target platforms are assumed.
SmallVector<const Expr *, 4> NewAligns;
for (Expr *E : Alignments) {
ExprResult Align;
if (E)
Align = VerifyPositiveIntegerConstantInClause(E, OMPC_aligned);
NewAligns.push_back(Align.get());
}
// OpenMP [2.8.2, declare simd construct, Description]
// The linear clause declares one or more list items to be private to a SIMD
// lane and to have a linear relationship with respect to the iteration space
// of a loop.
// The special this pointer can be used as if it was one of the arguments to
// the function in any of the linear, aligned, or uniform clauses.
// When a linear-step expression is specified in a linear clause it must be
// either a constant integer expression or an integer-typed parameter that is
// specified in a uniform clause on the directive.
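// Illustrative example (hypothetical function; 's' is also listed in a
// uniform clause, as required for a non-constant linear step):
//   #pragma omp declare simd uniform(s) linear(i : s)
//   void foo(int i, int s);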
llvm::DenseMap<const Decl *, const Expr *> LinearArgs;
const bool IsUniformedThis = UniformedLinearThis != nullptr;
auto MI = LinModifiers.begin();
for (const Expr *E : Linears) {
auto LinKind = static_cast<OpenMPLinearClauseKind>(*MI);
++MI;
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
FD->getParamDecl(PVD->getFunctionScopeIndex())
->getCanonicalDecl() == CanonPVD) {
// OpenMP [2.15.3.7, linear Clause, Restrictions]
// A list-item cannot appear in more than one linear clause.
if (LinearArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(OMPC_linear) << E->getSourceRange();
Diag(LinearArgs[CanonPVD]->getExprLoc(),
diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_linear);
continue;
}
// Each argument can appear in at most one uniform or linear clause.
if (UniformedArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(OMPC_uniform) << E->getSourceRange();
Diag(UniformedArgs[CanonPVD]->getExprLoc(),
diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_uniform);
continue;
}
LinearArgs[CanonPVD] = E;
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() ||
E->containsUnexpandedParameterPack())
continue;
(void)CheckOpenMPLinearDecl(CanonPVD, E->getExprLoc(), LinKind,
PVD->getOriginalType(),
/*IsDeclareSimd=*/true);
continue;
}
}
if (isa<CXXThisExpr>(E)) {
if (UniformedLinearThis) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(IsUniformedThis ? OMPC_uniform : OMPC_linear)
<< E->getSourceRange();
Diag(UniformedLinearThis->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(IsUniformedThis ? OMPC_uniform
: OMPC_linear);
continue;
}
UniformedLinearThis = E;
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
continue;
(void)CheckOpenMPLinearDecl(/*D=*/nullptr, E->getExprLoc(), LinKind,
E->getType(), /*IsDeclareSimd=*/true);
continue;
}
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
}
Expr *Step = nullptr;
Expr *NewStep = nullptr;
SmallVector<Expr *, 4> NewSteps;
for (Expr *E : Steps) {
// Skip the same step expression, it was checked already.
if (Step == E || !E) {
NewSteps.push_back(E ? NewStep : nullptr);
continue;
}
Step = E;
if (const auto *DRE = dyn_cast<DeclRefExpr>(Step))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (UniformedArgs.count(CanonPVD) == 0) {
Diag(Step->getExprLoc(), diag::err_omp_expected_uniform_param)
<< Step->getSourceRange();
} else if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() ||
E->containsUnexpandedParameterPack() ||
CanonPVD->getType()->hasIntegerRepresentation()) {
NewSteps.push_back(Step);
} else {
Diag(Step->getExprLoc(), diag::err_omp_expected_int_param)
<< Step->getSourceRange();
}
continue;
}
NewStep = Step;
if (Step && !Step->isValueDependent() && !Step->isTypeDependent() &&
!Step->isInstantiationDependent() &&
!Step->containsUnexpandedParameterPack()) {
NewStep = PerformOpenMPImplicitIntegerConversion(Step->getExprLoc(), Step)
.get();
if (NewStep)
NewStep = VerifyIntegerConstantExpression(NewStep).get();
}
NewSteps.push_back(NewStep);
}
auto *NewAttr = OMPDeclareSimdDeclAttr::CreateImplicit(
Context, BS, SL.get(), const_cast<Expr **>(Uniforms.data()),
Uniforms.size(), const_cast<Expr **>(Aligneds.data()), Aligneds.size(),
const_cast<Expr **>(NewAligns.data()), NewAligns.size(),
const_cast<Expr **>(Linears.data()), Linears.size(),
const_cast<unsigned *>(LinModifiers.data()), LinModifiers.size(),
NewSteps.data(), NewSteps.size(), SR);
ADecl->addAttr(NewAttr);
return DG;
}
static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto,
QualType NewType) {
assert(NewType->isFunctionProtoType() &&
"Expected function type with prototype.");
assert(FD->getType()->isFunctionNoProtoType() &&
"Expected function with type with no prototype.");
assert(FDWithProto->getType()->isFunctionProtoType() &&
"Expected function with prototype.");
// Synthesize parameters with the same types.
FD->setType(NewType);
SmallVector<ParmVarDecl *, 16> Params;
for (const ParmVarDecl *P : FDWithProto->parameters()) {
auto *Param = ParmVarDecl::Create(S.getASTContext(), FD, SourceLocation(),
SourceLocation(), nullptr, P->getType(),
/*TInfo=*/nullptr, SC_None, nullptr);
Param->setScopeInfo(0, Params.size());
Param->setImplicit();
Params.push_back(Param);
}
FD->setParams(Params);
}
Sema::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI)
: TI(&TI), NameSuffix(TI.getMangledName()) {}
FunctionDecl *
Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
Declarator &D) {
IdentifierInfo *BaseII = D.getIdentifier();
LookupResult Lookup(*this, DeclarationName(BaseII), D.getIdentifierLoc(),
LookupOrdinaryName);
LookupParsedName(Lookup, S, &D.getCXXScopeSpec());
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType FType = TInfo->getType();
bool IsConstexpr = D.getDeclSpec().getConstexprSpecifier() == CSK_constexpr;
bool IsConsteval = D.getDeclSpec().getConstexprSpecifier() == CSK_consteval;
FunctionDecl *BaseFD = nullptr;
for (auto *Candidate : Lookup) {
auto *UDecl = dyn_cast<FunctionDecl>(Candidate->getUnderlyingDecl());
if (!UDecl)
continue;
// Don't specialize constexpr/consteval functions with
// non-constexpr/consteval functions.
if (UDecl->isConstexpr() && !IsConstexpr)
continue;
if (UDecl->isConsteval() && !IsConsteval)
continue;
QualType NewType = Context.mergeFunctionTypes(
FType, UDecl->getType(), /* OfBlockPointer */ false,
/* Unqualified */ false, /* AllowCXX */ true);
if (NewType.isNull())
continue;
// Found a base!
BaseFD = UDecl;
break;
}
if (!BaseFD) {
BaseFD = cast<FunctionDecl>(ActOnDeclarator(S, D));
BaseFD->setImplicit(true);
}
OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
std::string MangledName;
MangledName += D.getIdentifier()->getName();
MangledName += getOpenMPVariantManglingSeparatorStr();
MangledName += DVScope.NameSuffix;
IdentifierInfo &VariantII = Context.Idents.get(MangledName);
VariantII.setMangledOpenMPVariantName(true);
D.SetIdentifier(&VariantII, D.getBeginLoc());
return BaseFD;
}
void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
FunctionDecl *FD, FunctionDecl *BaseFD) {
// Do not mark the function as used, to prevent its emission if this is the
// only place where it is used.
EnterExpressionEvaluationContext Unevaluated(
*this, Sema::ExpressionEvaluationContext::Unevaluated);
Expr *VariantFuncRef = DeclRefExpr::Create(
Context, NestedNameSpecifierLoc(), SourceLocation(), FD,
/* RefersToEnclosingVariableOrCapture */ false,
/* NameLoc */ FD->getLocation(), FD->getType(), ExprValueKind::VK_RValue);
OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
auto *OMPDeclareVariantA = OMPDeclareVariantAttr::CreateImplicit(
Context, VariantFuncRef, DVScope.TI);
BaseFD->addAttr(OMPDeclareVariantA);
}
ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope,
SourceLocation LParenLoc,
MultiExprArg ArgExprs,
SourceLocation RParenLoc, Expr *ExecConfig) {
// The common case is a regular call we do not want to specialize at all. Try
// to make that case fast by bailing early.
CallExpr *CE = dyn_cast<CallExpr>(Call.get());
if (!CE)
return Call;
FunctionDecl *CalleeFnDecl = CE->getDirectCallee();
if (!CalleeFnDecl)
return Call;
if (!CalleeFnDecl->hasAttr<OMPDeclareVariantAttr>())
return Call;
ASTContext &Context = getASTContext();
OMPContext OMPCtx(getLangOpts().OpenMPIsDevice,
Context.getTargetInfo().getTriple());
SmallVector<Expr *, 4> Exprs;
SmallVector<VariantMatchInfo, 4> VMIs;
while (CalleeFnDecl) {
for (OMPDeclareVariantAttr *A :
CalleeFnDecl->specific_attrs<OMPDeclareVariantAttr>()) {
Expr *VariantRef = A->getVariantFuncRef();
VariantMatchInfo VMI;
OMPTraitInfo &TI = A->getTraitInfo();
TI.getAsVariantMatchInfo(Context, VMI);
if (!isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ false))
continue;
VMIs.push_back(VMI);
Exprs.push_back(VariantRef);
}
CalleeFnDecl = CalleeFnDecl->getPreviousDecl();
}
ExprResult NewCall;
do {
int BestIdx = getBestVariantMatchForContext(VMIs, OMPCtx);
if (BestIdx < 0)
return Call;
Expr *BestExpr = cast<DeclRefExpr>(Exprs[BestIdx]);
Decl *BestDecl = cast<DeclRefExpr>(BestExpr)->getDecl();
{
// Try to build a (member) call expression for the current best applicable
// variant expression. We allow this to fail, in which case we continue
// with the next best variant expression. The failure case is covered by the
// implementation-defined behavior the OpenMP standard describes for
// differences in the function prototypes: "Any differences
// that the specific OpenMP context requires in the prototype of the
// variant from the base function prototype are implementation defined."
// This wording allows the specialized variant to have a different type
// than the base function. That is intended and OK, but if we cannot create
// a call, the difference is outside the "implementation defined range" we
// allow.
Sema::TentativeAnalysisScope Trap(*this);
if (auto *SpecializedMethod = dyn_cast<CXXMethodDecl>(BestDecl)) {
auto *MemberCall = dyn_cast<CXXMemberCallExpr>(CE);
BestExpr = MemberExpr::CreateImplicit(
Context, MemberCall->getImplicitObjectArgument(),
/* IsArrow */ false, SpecializedMethod, Context.BoundMemberTy,
MemberCall->getValueKind(), MemberCall->getObjectKind());
}
NewCall = BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, RParenLoc,
ExecConfig);
if (NewCall.isUsable())
break;
}
VMIs.erase(VMIs.begin() + BestIdx);
Exprs.erase(Exprs.begin() + BestIdx);
} while (!VMIs.empty());
if (!NewCall.isUsable())
return Call;
return PseudoObjectExpr::Create(Context, CE, {NewCall.get()}, 0);
}
Optional<std::pair<FunctionDecl *, Expr *>>
Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
Expr *VariantRef, OMPTraitInfo &TI,
SourceRange SR) {
if (!DG || DG.get().isNull())
return None;
const int VariantId = 1;
// Must be applied only to single decl.
if (!DG.get().isSingleDecl()) {
Diag(SR.getBegin(), diag::err_omp_single_decl_in_declare_simd_variant)
<< VariantId << SR;
return None;
}
Decl *ADecl = DG.get().getSingleDecl();
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(ADecl))
ADecl = FTD->getTemplatedDecl();
// Decl must be a function.
auto *FD = dyn_cast<FunctionDecl>(ADecl);
if (!FD) {
Diag(ADecl->getLocation(), diag::err_omp_function_expected)
<< VariantId << SR;
return None;
}
auto &&HasMultiVersionAttributes = [](const FunctionDecl *FD) {
return FD->hasAttrs() &&
(FD->hasAttr<CPUDispatchAttr>() || FD->hasAttr<CPUSpecificAttr>() ||
FD->hasAttr<TargetAttr>());
};
// OpenMP is not compatible with CPU-specific attributes.
if (HasMultiVersionAttributes(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_incompat_attributes)
<< SR;
return None;
}
// Allow #pragma omp declare variant only if the function is not used.
if (FD->isUsed(false))
Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_used)
<< FD->getLocation();
// Check if the function was emitted already.
const FunctionDecl *Definition;
if (!FD->isThisDeclarationADefinition() && FD->isDefined(Definition) &&
(LangOpts.EmitAllDecls || Context.DeclMustBeEmitted(Definition)))
Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_emitted)
<< FD->getLocation();
// The VariantRef must point to a function.
if (!VariantRef) {
Diag(SR.getBegin(), diag::err_omp_function_expected) << VariantId;
return None;
}
auto ShouldDelayChecks = [](Expr *&E, bool) {
return E && (E->isTypeDependent() || E->isValueDependent() ||
E->containsUnexpandedParameterPack() ||
E->isInstantiationDependent());
};
// Do not check templates, wait until instantiation.
if (FD->isDependentContext() || ShouldDelayChecks(VariantRef, false) ||
TI.anyScoreOrCondition(ShouldDelayChecks))
return std::make_pair(FD, VariantRef);
// Deal with non-constant score and user condition expressions.
auto HandleNonConstantScoresAndConditions = [this](Expr *&E,
bool IsScore) -> bool {
llvm::APSInt Result;
if (!E || E->isIntegerConstantExpr(Result, Context))
return false;
if (IsScore) {
// We warn on non-constant scores and pretend they were not present.
Diag(E->getExprLoc(), diag::warn_omp_declare_variant_score_not_constant)
<< E;
E = nullptr;
} else {
// We could replace a non-constant user condition with "false" but we
// will soon need to handle these anyway for the dynamic version of
// OpenMP context selectors.
Diag(E->getExprLoc(),
diag::err_omp_declare_variant_user_condition_not_constant)
<< E;
}
return true;
};
if (TI.anyScoreOrCondition(HandleNonConstantScoresAndConditions))
return None;
// Convert VariantRef expression to the type of the original function to
// resolve possible conflicts.
ExprResult VariantRefCast;
if (LangOpts.CPlusPlus) {
QualType FnPtrType;
auto *Method = dyn_cast<CXXMethodDecl>(FD);
if (Method && !Method->isStatic()) {
const Type *ClassType =
Context.getTypeDeclType(Method->getParent()).getTypePtr();
FnPtrType = Context.getMemberPointerType(FD->getType(), ClassType);
ExprResult ER;
{
// Build addr_of unary op to correctly handle type checks for member
// functions.
Sema::TentativeAnalysisScope Trap(*this);
ER = CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf,
VariantRef);
}
if (!ER.isUsable()) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
}
VariantRef = ER.get();
} else {
FnPtrType = Context.getPointerType(FD->getType());
}
ImplicitConversionSequence ICS =
TryImplicitConversion(VariantRef, FnPtrType.getUnqualifiedType(),
/*SuppressUserConversions=*/false,
AllowedExplicit::None,
/*InOverloadResolution=*/false,
/*CStyle=*/false,
/*AllowObjCWritebackConversion=*/false);
if (ICS.isFailure()) {
Diag(VariantRef->getExprLoc(),
diag::err_omp_declare_variant_incompat_types)
<< VariantRef->getType()
<< ((Method && !Method->isStatic()) ? FnPtrType : FD->getType())
<< VariantRef->getSourceRange();
return None;
}
VariantRefCast = PerformImplicitConversion(
VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting);
if (!VariantRefCast.isUsable())
return None;
// Drop previously built artificial addr_of unary op for member functions.
if (Method && !Method->isStatic()) {
Expr *PossibleAddrOfVariantRef = VariantRefCast.get();
if (auto *UO = dyn_cast<UnaryOperator>(
PossibleAddrOfVariantRef->IgnoreImplicit()))
VariantRefCast = UO->getSubExpr();
}
} else {
VariantRefCast = VariantRef;
}
ExprResult ER = CheckPlaceholderExpr(VariantRefCast.get());
if (!ER.isUsable() ||
!ER.get()->IgnoreParenImpCasts()->getType()->isFunctionType()) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
}
// The VariantRef must point to a function.
auto *DRE = dyn_cast<DeclRefExpr>(ER.get()->IgnoreParenImpCasts());
if (!DRE) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
}
auto *NewFD = dyn_cast_or_null<FunctionDecl>(DRE->getDecl());
if (!NewFD) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
}
// Check if function types are compatible in C.
if (!LangOpts.CPlusPlus) {
QualType NewType =
Context.mergeFunctionTypes(FD->getType(), NewFD->getType());
if (NewType.isNull()) {
Diag(VariantRef->getExprLoc(),
diag::err_omp_declare_variant_incompat_types)
<< NewFD->getType() << FD->getType() << VariantRef->getSourceRange();
return None;
}
if (NewType->isFunctionProtoType()) {
if (FD->getType()->isFunctionNoProtoType())
setPrototype(*this, FD, NewFD, NewType);
else if (NewFD->getType()->isFunctionNoProtoType())
setPrototype(*this, NewFD, FD, NewType);
}
}
// Check that the variant function is not itself marked with a declare
// variant directive.
if (NewFD->hasAttrs() && NewFD->hasAttr<OMPDeclareVariantAttr>()) {
Diag(VariantRef->getExprLoc(),
diag::warn_omp_declare_variant_marked_as_declare_variant)
<< VariantRef->getSourceRange();
SourceRange SR =
NewFD->specific_attr_begin<OMPDeclareVariantAttr>()->getRange();
Diag(SR.getBegin(), diag::note_omp_marked_declare_variant_here) << SR;
return None;
}
enum DoesntSupport {
VirtFuncs = 1,
Constructors = 3,
Destructors = 4,
DeletedFuncs = 5,
DefaultedFuncs = 6,
ConstexprFuncs = 7,
ConstevalFuncs = 8,
};
if (const auto *CXXFD = dyn_cast<CXXMethodDecl>(FD)) {
if (CXXFD->isVirtual()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< VirtFuncs;
return None;
}
if (isa<CXXConstructorDecl>(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< Constructors;
return None;
}
if (isa<CXXDestructorDecl>(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< Destructors;
return None;
}
}
if (FD->isDeleted()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< DeletedFuncs;
return None;
}
if (FD->isDefaulted()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< DefaultedFuncs;
return None;
}
if (FD->isConstexpr()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< (NewFD->isConsteval() ? ConstevalFuncs : ConstexprFuncs);
return None;
}
// Check general compatibility.
if (areMultiversionVariantFunctionsCompatible(
FD, NewFD, PartialDiagnostic::NullDiagnostic(),
PartialDiagnosticAt(SourceLocation(),
PartialDiagnostic::NullDiagnostic()),
PartialDiagnosticAt(
VariantRef->getExprLoc(),
PDiag(diag::err_omp_declare_variant_doesnt_support)),
PartialDiagnosticAt(VariantRef->getExprLoc(),
PDiag(diag::err_omp_declare_variant_diff)
<< FD->getLocation()),
/*TemplatesSupported=*/true, /*ConstexprSupported=*/false,
/*CLinkageMayDiffer=*/true))
return None;
return std::make_pair(FD, cast<Expr>(DRE));
}
void Sema::ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD,
Expr *VariantRef,
OMPTraitInfo &TI,
SourceRange SR) {
auto *NewAttr =
OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, &TI, SR);
FD->addAttr(NewAttr);
}
StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
setFunctionHasBranchProtectedScope();
return OMPParallelDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(),
DSAStack->isCancelRegion());
}
namespace {
/// Iteration space of a single for loop.
struct LoopIterationSpace final {
/// True if the condition operator is the strict compare operator (<, > or
/// !=).
bool IsStrictCompare = false;
/// Condition of the loop.
Expr *PreCond = nullptr;
/// This expression calculates the number of iterations in the loop.
/// It is always possible to calculate it before starting the loop.
Expr *NumIterations = nullptr;
/// The loop counter variable.
Expr *CounterVar = nullptr;
/// Private loop counter variable.
Expr *PrivateCounterVar = nullptr;
/// This is initializer for the initial value of #CounterVar.
Expr *CounterInit = nullptr;
/// This is step for the #CounterVar used to generate its update:
/// #CounterVar = #CounterInit + #CounterStep * CurrentIteration.
Expr *CounterStep = nullptr;
/// Should step be subtracted?
bool Subtract = false;
/// Source range of the loop init.
SourceRange InitSrcRange;
/// Source range of the loop condition.
SourceRange CondSrcRange;
/// Source range of the loop increment.
SourceRange IncSrcRange;
/// Minimum value that the loop control variable can have. Used to support
/// non-rectangular loops. Applied only for LCVs with non-iterator types,
/// since only such variables can be used in non-loop-invariant expressions.
Expr *MinValue = nullptr;
/// Maximum value that the loop control variable can have. Used to support
/// non-rectangular loops. Applied only for LCVs with non-iterator types,
/// since only such variables can be used in non-loop-invariant expressions.
Expr *MaxValue = nullptr;
/// true, if the lower bound depends on the outer loop control var.
bool IsNonRectangularLB = false;
/// true, if the upper bound depends on the outer loop control var.
bool IsNonRectangularUB = false;
/// Index of the loop this loop depends on and forms non-rectangular loop
/// nest.
unsigned LoopDependentIdx = 0;
/// Final condition for the non-rectangular loop nest support. It is used to
/// check that the number of iterations for this particular counter must be
/// finished.
Expr *FinalCondition = nullptr;
};
/// Helper class for checking canonical form of the OpenMP loops and
/// extracting iteration space of each loop in the loop nest, that will be used
/// for IR generation.
class OpenMPIterationSpaceChecker {
/// Reference to Sema.
Sema &SemaRef;
/// Data-sharing stack.
DSAStackTy &Stack;
/// A location for diagnostics (when there is no better location).
SourceLocation DefaultLoc;
/// A location for diagnostics (when increment is not compatible).
SourceLocation ConditionLoc;
/// A source range for referring to the loop init later.
SourceRange InitSrcRange;
/// A source range for referring to the condition later.
SourceRange ConditionSrcRange;
/// A source range for referring to the increment later.
SourceRange IncrementSrcRange;
/// Loop variable.
ValueDecl *LCDecl = nullptr;
/// Reference to loop variable.
Expr *LCRef = nullptr;
/// Lower bound (initializer for the var).
Expr *LB = nullptr;
/// Upper bound.
Expr *UB = nullptr;
/// Loop step (increment).
Expr *Step = nullptr;
/// True when the condition is one of:
/// Var < UB
/// Var <= UB
/// UB > Var
/// UB >= Var
/// This has no value when the condition is !=.
llvm::Optional<bool> TestIsLessOp;
/// This flag is true when condition is strict ( < or > ).
bool TestIsStrictOp = false;
/// This flag is true when step is subtracted on each iteration.
bool SubtractStep = false;
/// The outer loop counter this loop depends on (if any).
const ValueDecl *DepDecl = nullptr;
/// Number of the loop (starting from 1) on which the loop counter init
/// expression of this loop depends.
Optional<unsigned> InitDependOnLC;
/// Number of the loop (starting from 1) on which the loop counter condition
/// expression of this loop depends.
Optional<unsigned> CondDependOnLC;
/// Checks if the provided statement depends on the loop counter.
Optional<unsigned> doesDependOnLoopCounter(const Stmt *S, bool IsInitializer);
/// Original condition required for checking of the exit condition for
/// non-rectangular loop.
Expr *Condition = nullptr;
public:
OpenMPIterationSpaceChecker(Sema &SemaRef, DSAStackTy &Stack,
SourceLocation DefaultLoc)
: SemaRef(SemaRef), Stack(Stack), DefaultLoc(DefaultLoc),
ConditionLoc(DefaultLoc) {}
/// Check init-expr for canonical loop form and save loop counter
/// variable - #Var and its initialization value - #LB.
bool checkAndSetInit(Stmt *S, bool EmitDiags = true);
/// Check test-expr for canonical form, save upper-bound (#UB), flags
/// for less/greater and for strict/non-strict comparison.
bool checkAndSetCond(Expr *S);
/// Check incr-expr for canonical loop form and return true if it
/// does not conform, otherwise save loop step (#Step).
bool checkAndSetInc(Expr *S);
/// Return the loop counter variable.
ValueDecl *getLoopDecl() const { return LCDecl; }
/// Return the reference expression to loop counter variable.
Expr *getLoopDeclRefExpr() const { return LCRef; }
/// Source range of the loop init.
SourceRange getInitSrcRange() const { return InitSrcRange; }
/// Source range of the loop condition.
SourceRange getConditionSrcRange() const { return ConditionSrcRange; }
/// Source range of the loop increment.
SourceRange getIncrementSrcRange() const { return IncrementSrcRange; }
/// True if the step should be subtracted.
bool shouldSubtractStep() const { return SubtractStep; }
/// True, if the compare operator is strict (<, > or !=).
bool isStrictTestOp() const { return TestIsStrictOp; }
/// Build the expression to calculate the number of iterations.
Expr *buildNumIterations(
Scope *S, ArrayRef<LoopIterationSpace> ResultIterSpaces, bool LimitedType,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Build the precondition expression for the loops.
Expr *
buildPreCond(Scope *S, Expr *Cond,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Build reference expression to the counter to be used for codegen.
DeclRefExpr *
buildCounterVar(llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
DSAStackTy &DSA) const;
/// Build reference expression to the private counter to be used for
/// codegen.
Expr *buildPrivateCounterVar() const;
/// Build initialization of the counter to be used for codegen.
Expr *buildCounterInit() const;
/// Build step of the counter to be used for codegen.
Expr *buildCounterStep() const;
/// Build loop data with counter value for depend clauses in ordered
/// directives.
Expr *
buildOrderedLoopData(Scope *S, Expr *Counter,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
SourceLocation Loc, Expr *Inc = nullptr,
OverloadedOperatorKind OOK = OO_Amp);
/// Builds the minimum and maximum values for the loop counter.
std::pair<Expr *, Expr *> buildMinMaxValues(
Scope *S, llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Builds final condition for the non-rectangular loops.
Expr *buildFinalCondition(Scope *S) const;
/// Return true if any expression is dependent.
bool dependent() const;
/// Returns true if the initializer forms a non-rectangular loop.
bool doesInitDependOnLC() const { return InitDependOnLC.hasValue(); }
/// Returns true if the condition forms a non-rectangular loop.
bool doesCondDependOnLC() const { return CondDependOnLC.hasValue(); }
/// Returns index of the loop we depend on (starting from 1), or 0 otherwise.
unsigned getLoopDependentIdx() const {
return InitDependOnLC.getValueOr(CondDependOnLC.getValueOr(0));
}
private:
/// Check the right-hand side of an assignment in the increment
/// expression.
bool checkAndSetIncRHS(Expr *RHS);
/// Helper to set loop counter variable and its initializer.
bool setLCDeclAndLB(ValueDecl *NewLCDecl, Expr *NewDeclRefExpr, Expr *NewLB,
bool EmitDiags);
/// Helper to set upper bound.
bool setUB(Expr *NewUB, llvm::Optional<bool> LessOp, bool StrictOp,
SourceRange SR, SourceLocation SL);
/// Helper to set loop increment.
bool setStep(Expr *NewStep, bool Subtract);
};
bool OpenMPIterationSpaceChecker::dependent() const {
if (!LCDecl) {
assert(!LB && !UB && !Step);
return false;
}
return LCDecl->getType()->isDependentType() ||
(LB && LB->isValueDependent()) || (UB && UB->isValueDependent()) ||
(Step && Step->isValueDependent());
}
bool OpenMPIterationSpaceChecker::setLCDeclAndLB(ValueDecl *NewLCDecl,
Expr *NewLCRefExpr,
Expr *NewLB, bool EmitDiags) {
// State consistency checking to ensure correct usage.
assert(LCDecl == nullptr && LB == nullptr && LCRef == nullptr &&
UB == nullptr && Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
if (!NewLCDecl || !NewLB)
return true;
LCDecl = getCanonicalDecl(NewLCDecl);
LCRef = NewLCRefExpr;
if (auto *CE = dyn_cast_or_null<CXXConstructExpr>(NewLB))
if (const CXXConstructorDecl *Ctor = CE->getConstructor())
if ((Ctor->isCopyOrMoveConstructor() ||
Ctor->isConvertingConstructor(/*AllowExplicit=*/false)) &&
CE->getNumArgs() > 0 && CE->getArg(0) != nullptr)
NewLB = CE->getArg(0)->IgnoreParenImpCasts();
LB = NewLB;
if (EmitDiags)
InitDependOnLC = doesDependOnLoopCounter(LB, /*IsInitializer=*/true);
return false;
}
bool OpenMPIterationSpaceChecker::setUB(Expr *NewUB,
llvm::Optional<bool> LessOp,
bool StrictOp, SourceRange SR,
SourceLocation SL) {
// State consistency checking to ensure correct usage.
assert(LCDecl != nullptr && LB != nullptr && UB == nullptr &&
Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
if (!NewUB)
return true;
UB = NewUB;
if (LessOp)
TestIsLessOp = LessOp;
TestIsStrictOp = StrictOp;
ConditionSrcRange = SR;
ConditionLoc = SL;
CondDependOnLC = doesDependOnLoopCounter(UB, /*IsInitializer=*/false);
return false;
}
bool OpenMPIterationSpaceChecker::setStep(Expr *NewStep, bool Subtract) {
// State consistency checking to ensure correct usage.
assert(LCDecl != nullptr && LB != nullptr && Step == nullptr);
if (!NewStep)
return true;
if (!NewStep->isValueDependent()) {
// Check that the step is integer expression.
SourceLocation StepLoc = NewStep->getBeginLoc();
ExprResult Val = SemaRef.PerformOpenMPImplicitIntegerConversion(
StepLoc, getExprAsWritten(NewStep));
if (Val.isInvalid())
return true;
NewStep = Val.get();
// OpenMP [2.6, Canonical Loop Form, Restrictions]
// If test-expr is of form var relational-op b and relational-op is < or
// <= then incr-expr must cause var to increase on each iteration of the
// loop. If test-expr is of form var relational-op b and relational-op is
// > or >= then incr-expr must cause var to decrease on each iteration of
// the loop.
// If test-expr is of form b relational-op var and relational-op is < or
// <= then incr-expr must cause var to decrease on each iteration of the
// loop. If test-expr is of form b relational-op var and relational-op is
// > or >= then incr-expr must cause var to increase on each iteration of
// the loop.
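// For example, 'for (i = 0; i < N; i += 2)' is canonical, while
// 'for (i = 0; i < N; i -= 2)' is diagnosed because the step can never make
// 'i' reach the upper bound.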
llvm::APSInt Result;
bool IsConstant = NewStep->isIntegerConstantExpr(Result, SemaRef.Context);
bool IsUnsigned = !NewStep->getType()->hasSignedIntegerRepresentation();
bool IsConstNeg =
IsConstant && Result.isSigned() && (Subtract != Result.isNegative());
bool IsConstPos =
IsConstant && Result.isSigned() && (Subtract == Result.isNegative());
bool IsConstZero = IsConstant && !Result.getBoolValue();
// != with increment is treated as <; != with decrement is treated as >
if (!TestIsLessOp.hasValue())
TestIsLessOp = IsConstPos || (IsUnsigned && !Subtract);
if (UB && (IsConstZero ||
(TestIsLessOp.getValue() ?
(IsConstNeg || (IsUnsigned && Subtract)) :
(IsConstPos || (IsUnsigned && !Subtract))))) {
SemaRef.Diag(NewStep->getExprLoc(),
diag::err_omp_loop_incr_not_compatible)
<< LCDecl << TestIsLessOp.getValue() << NewStep->getSourceRange();
SemaRef.Diag(ConditionLoc,
diag::note_omp_loop_cond_requres_compatible_incr)
<< TestIsLessOp.getValue() << ConditionSrcRange;
return true;
}
if (TestIsLessOp.getValue() == Subtract) {
NewStep =
SemaRef.CreateBuiltinUnaryOp(NewStep->getExprLoc(), UO_Minus, NewStep)
.get();
Subtract = !Subtract;
}
}
Step = NewStep;
SubtractStep = Subtract;
return false;
}
namespace {
/// Checker for the non-rectangular loops. Checks if the initializer or
/// condition expression references loop counter variable.
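/// For example, in a collapsed nest such as
///   for (int i = 0; i < N; ++i)
///     for (int j = i; j < N; ++j)
/// the inner initializer references the outer counter 'i', which makes the
/// nest non-rectangular and is recorded here as a loop dependency.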
class LoopCounterRefChecker final
: public ConstStmtVisitor<LoopCounterRefChecker, bool> {
Sema &SemaRef;
DSAStackTy &Stack;
const ValueDecl *CurLCDecl = nullptr;
const ValueDecl *DepDecl = nullptr;
const ValueDecl *PrevDepDecl = nullptr;
bool IsInitializer = true;
unsigned BaseLoopId = 0;
bool checkDecl(const Expr *E, const ValueDecl *VD) {
if (getCanonicalDecl(VD) == getCanonicalDecl(CurLCDecl)) {
SemaRef.Diag(E->getExprLoc(), diag::err_omp_stmt_depends_on_loop_counter)
<< (IsInitializer ? 0 : 1);
return false;
}
const auto &&Data = Stack.isLoopControlVariable(VD);
// OpenMP, 2.9.1 Canonical Loop Form, Restrictions.
// The type of the loop iterator on which we depend may not have a random
// access iterator type.
if (Data.first && VD->getType()->isRecordType()) {
SmallString<128> Name;
llvm::raw_svector_ostream OS(Name);
VD->getNameForDiagnostic(OS, SemaRef.getPrintingPolicy(),
/*Qualified=*/true);
SemaRef.Diag(E->getExprLoc(),
diag::err_omp_wrong_dependency_iterator_type)
<< OS.str();
SemaRef.Diag(VD->getLocation(), diag::note_previous_decl) << VD;
return false;
}
if (Data.first &&
(DepDecl || (PrevDepDecl &&
getCanonicalDecl(VD) != getCanonicalDecl(PrevDepDecl)))) {
if (!DepDecl && PrevDepDecl)
DepDecl = PrevDepDecl;
SmallString<128> Name;
llvm::raw_svector_ostream OS(Name);
DepDecl->getNameForDiagnostic(OS, SemaRef.getPrintingPolicy(),
/*Qualified=*/true);
SemaRef.Diag(E->getExprLoc(),
diag::err_omp_invariant_or_linear_dependency)
<< OS.str();
return false;
}
if (Data.first) {
DepDecl = VD;
BaseLoopId = Data.first;
}
return Data.first;
}
public:
bool VisitDeclRefExpr(const DeclRefExpr *E) {
const ValueDecl *VD = E->getDecl();
if (isa<VarDecl>(VD))
return checkDecl(E, VD);
return false;
}
bool VisitMemberExpr(const MemberExpr *E) {
if (isa<CXXThisExpr>(E->getBase()->IgnoreParens())) {
const ValueDecl *VD = E->getMemberDecl();
if (isa<VarDecl>(VD) || isa<FieldDecl>(VD))
return checkDecl(E, VD);
}
return false;
}
bool VisitStmt(const Stmt *S) {
bool Res = false;
for (const Stmt *Child : S->children())
Res = (Child && Visit(Child)) || Res;
return Res;
}
explicit LoopCounterRefChecker(Sema &SemaRef, DSAStackTy &Stack,
const ValueDecl *CurLCDecl, bool IsInitializer,
const ValueDecl *PrevDepDecl = nullptr)
: SemaRef(SemaRef), Stack(Stack), CurLCDecl(CurLCDecl),
PrevDepDecl(PrevDepDecl), IsInitializer(IsInitializer) {}
unsigned getBaseLoopId() const {
assert(CurLCDecl && "Expected loop dependency.");
return BaseLoopId;
}
const ValueDecl *getDepDecl() const {
assert(CurLCDecl && "Expected loop dependency.");
return DepDecl;
}
};
} // namespace
Optional<unsigned>
OpenMPIterationSpaceChecker::doesDependOnLoopCounter(const Stmt *S,
bool IsInitializer) {
// Check for the non-rectangular loops.
LoopCounterRefChecker LoopStmtChecker(SemaRef, Stack, LCDecl, IsInitializer,
DepDecl);
if (LoopStmtChecker.Visit(S)) {
DepDecl = LoopStmtChecker.getDepDecl();
return LoopStmtChecker.getBaseLoopId();
}
return llvm::None;
}
bool OpenMPIterationSpaceChecker::checkAndSetInit(Stmt *S, bool EmitDiags) {
// Check init-expr for canonical loop form and save loop counter
// variable - #Var and its initialization value - #LB.
// OpenMP [2.6] Canonical loop form. init-expr may be one of the following:
// var = lb
// integer-type var = lb
// random-access-iterator-type var = lb
// pointer-type var = lb
//
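// Examples: 'i = 0', 'int i = 0', 'T *p = arr' or, in C++,
// 'Iter it = cont.begin()'.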
if (!S) {
if (EmitDiags) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_init);
}
return true;
}
if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
if (!ExprTemp->cleanupsHaveSideEffects())
S = ExprTemp->getSubExpr();
InitSrcRange = S->getSourceRange();
if (Expr *E = dyn_cast<Expr>(S))
S = E->IgnoreParens();
if (auto *BO = dyn_cast<BinaryOperator>(S)) {
if (BO->getOpcode() == BO_Assign) {
Expr *LHS = BO->getLHS()->IgnoreParens();
if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
return setLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS(),
EmitDiags);
return setLCDeclAndLB(DRE->getDecl(), DRE, BO->getRHS(), EmitDiags);
}
if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
if (ME->isArrow() &&
isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
return setLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS(),
EmitDiags);
}
}
} else if (auto *DS = dyn_cast<DeclStmt>(S)) {
if (DS->isSingleDecl()) {
if (auto *Var = dyn_cast_or_null<VarDecl>(DS->getSingleDecl())) {
if (Var->hasInit() && !Var->getType()->isReferenceType()) {
// Accept non-canonical init form here but emit ext. warning.
if (Var->getInitStyle() != VarDecl::CInit && EmitDiags)
SemaRef.Diag(S->getBeginLoc(),
diag::ext_omp_loop_not_canonical_init)
<< S->getSourceRange();
return setLCDeclAndLB(
Var,
buildDeclRefExpr(SemaRef, Var,
Var->getType().getNonReferenceType(),
DS->getBeginLoc()),
Var->getInit(), EmitDiags);
}
}
}
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
if (CE->getOperator() == OO_Equal) {
Expr *LHS = CE->getArg(0);
if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
return setLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1),
EmitDiags);
return setLCDeclAndLB(DRE->getDecl(), DRE, CE->getArg(1), EmitDiags);
}
if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
if (ME->isArrow() &&
isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
return setLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1),
EmitDiags);
}
}
}
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
if (EmitDiags) {
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_loop_not_canonical_init)
<< S->getSourceRange();
}
return true;
}
/// Ignore parentheses, implicit casts and copy constructors, and return the
/// variable (which may be the loop variable) if possible.
static const ValueDecl *getInitLCDecl(const Expr *E) {
if (!E)
return nullptr;
E = getExprAsWritten(E);
if (const auto *CE = dyn_cast_or_null<CXXConstructExpr>(E))
if (const CXXConstructorDecl *Ctor = CE->getConstructor())
if ((Ctor->isCopyOrMoveConstructor() ||
Ctor->isConvertingConstructor(/*AllowExplicit=*/false)) &&
CE->getNumArgs() > 0 && CE->getArg(0) != nullptr)
E = CE->getArg(0)->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast_or_null<DeclRefExpr>(E)) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
return getCanonicalDecl(VD);
}
if (const auto *ME = dyn_cast_or_null<MemberExpr>(E))
if (ME->isArrow() && isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
return getCanonicalDecl(ME->getMemberDecl());
return nullptr;
}
bool OpenMPIterationSpaceChecker::checkAndSetCond(Expr *S) {
// Check test-expr for canonical form, save upper-bound UB, flags for
// less/greater and for strict/non-strict comparison.
// OpenMP [2.9] Canonical loop form. Test-expr may be one of the following:
// var relational-op b
// b relational-op var
//
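// Examples: 'i < N', 'N >= i' or, since OpenMP 5.0, 'i != N'.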
bool IneqCondIsCanonical = SemaRef.getLangOpts().OpenMP >= 50;
if (!S) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_cond)
<< (IneqCondIsCanonical ? 1 : 0) << LCDecl;
return true;
}
Condition = S;
S = getExprAsWritten(S);
SourceLocation CondLoc = S->getBeginLoc();
if (auto *BO = dyn_cast<BinaryOperator>(S)) {
if (BO->isRelationalOp()) {
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setUB(BO->getRHS(),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_LE),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
BO->getSourceRange(), BO->getOperatorLoc());
if (getInitLCDecl(BO->getRHS()) == LCDecl)
return setUB(BO->getLHS(),
(BO->getOpcode() == BO_GT || BO->getOpcode() == BO_GE),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
BO->getSourceRange(), BO->getOperatorLoc());
} else if (IneqCondIsCanonical && BO->getOpcode() == BO_NE)
return setUB(
getInitLCDecl(BO->getLHS()) == LCDecl ? BO->getRHS() : BO->getLHS(),
/*LessOp=*/llvm::None,
/*StrictOp=*/true, BO->getSourceRange(), BO->getOperatorLoc());
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
if (CE->getNumArgs() == 2) {
auto Op = CE->getOperator();
switch (Op) {
case OO_Greater:
case OO_GreaterEqual:
case OO_Less:
case OO_LessEqual:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setUB(CE->getArg(1), Op == OO_Less || Op == OO_LessEqual,
Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
CE->getOperatorLoc());
if (getInitLCDecl(CE->getArg(1)) == LCDecl)
return setUB(CE->getArg(0), Op == OO_Greater || Op == OO_GreaterEqual,
Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
CE->getOperatorLoc());
break;
case OO_ExclaimEqual:
if (IneqCondIsCanonical)
return setUB(getInitLCDecl(CE->getArg(0)) == LCDecl ? CE->getArg(1)
: CE->getArg(0),
/*LessOp=*/llvm::None,
/*StrictOp=*/true, CE->getSourceRange(),
CE->getOperatorLoc());
break;
default:
break;
}
}
}
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(CondLoc, diag::err_omp_loop_not_canonical_cond)
<< (IneqCondIsCanonical ? 1 : 0) << S->getSourceRange() << LCDecl;
return true;
}
bool OpenMPIterationSpaceChecker::checkAndSetIncRHS(Expr *RHS) {
// RHS of canonical loop form increment can be:
// var + incr
// incr + var
// var - incr
//
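// Examples: 'i = i + 2', 'i = 2 + i', 'i = i - 2'.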
RHS = RHS->IgnoreParenImpCasts();
if (auto *BO = dyn_cast<BinaryOperator>(RHS)) {
if (BO->isAdditiveOp()) {
bool IsAdd = BO->getOpcode() == BO_Add;
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setStep(BO->getRHS(), !IsAdd);
if (IsAdd && getInitLCDecl(BO->getRHS()) == LCDecl)
return setStep(BO->getLHS(), /*Subtract=*/false);
}
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(RHS)) {
bool IsAdd = CE->getOperator() == OO_Plus;
if ((IsAdd || CE->getOperator() == OO_Minus) && CE->getNumArgs() == 2) {
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(CE->getArg(1), !IsAdd);
if (IsAdd && getInitLCDecl(CE->getArg(1)) == LCDecl)
return setStep(CE->getArg(0), /*Subtract=*/false);
}
}
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(RHS->getBeginLoc(), diag::err_omp_loop_not_canonical_incr)
<< RHS->getSourceRange() << LCDecl;
return true;
}
bool OpenMPIterationSpaceChecker::checkAndSetInc(Expr *S) {
// Check incr-expr for canonical loop form and return true if it
// does not conform.
// OpenMP [2.6] Canonical loop form. Incr-expr may be one of the following:
// ++var
// var++
// --var
// var--
// var += incr
// var -= incr
// var = var + incr
// var = incr + var
// var = var - incr
//
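// Examples: '++i', 'i -= 4', 'i = i + step'; a form such as 'i *= 2' is
// rejected as non-canonical.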
if (!S) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_incr) << LCDecl;
return true;
}
if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
if (!ExprTemp->cleanupsHaveSideEffects())
S = ExprTemp->getSubExpr();
IncrementSrcRange = S->getSourceRange();
S = S->IgnoreParens();
if (auto *UO = dyn_cast<UnaryOperator>(S)) {
if (UO->isIncrementDecrementOp() &&
getInitLCDecl(UO->getSubExpr()) == LCDecl)
return setStep(SemaRef
.ActOnIntegerConstant(UO->getBeginLoc(),
(UO->isDecrementOp() ? -1 : 1))
.get(),
/*Subtract=*/false);
} else if (auto *BO = dyn_cast<BinaryOperator>(S)) {
switch (BO->getOpcode()) {
case BO_AddAssign:
case BO_SubAssign:
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setStep(BO->getRHS(), BO->getOpcode() == BO_SubAssign);
break;
case BO_Assign:
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return checkAndSetIncRHS(BO->getRHS());
break;
default:
break;
}
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
switch (CE->getOperator()) {
case OO_PlusPlus:
case OO_MinusMinus:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(SemaRef
.ActOnIntegerConstant(
CE->getBeginLoc(),
((CE->getOperator() == OO_MinusMinus) ? -1 : 1))
.get(),
/*Subtract=*/false);
break;
case OO_PlusEqual:
case OO_MinusEqual:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(CE->getArg(1), CE->getOperator() == OO_MinusEqual);
break;
case OO_Equal:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return checkAndSetIncRHS(CE->getArg(1));
break;
default:
break;
}
}
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_loop_not_canonical_incr)
<< S->getSourceRange() << LCDecl;
return true;
}
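/// Capture the given expression in a temporary so it can be reused by the
/// generated loop expressions. Constant-evaluatable expressions are not
/// captured, and expressions already present in \p Captures reuse the cached
/// capture.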
static ExprResult
tryBuildCapture(Sema &SemaRef, Expr *Capture,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
if (SemaRef.CurContext->isDependentContext() || Capture->containsErrors())
return Capture;
if (Capture->isEvaluatable(SemaRef.Context, Expr::SE_AllowSideEffects))
return SemaRef.PerformImplicitConversion(
Capture->IgnoreImpCasts(), Capture->getType(), Sema::AA_Converting,
/*AllowExplicit=*/true);
auto I = Captures.find(Capture);
if (I != Captures.end())
return buildCapture(SemaRef, Capture, I->second);
DeclRefExpr *Ref = nullptr;
ExprResult Res = buildCapture(SemaRef, Capture, Ref);
Captures[Capture] = Ref;
return Res;
}
/// Calculate the number of iterations, converting to an unsigned type if the
/// number of iterations may not fit in the original type.
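/// For example, for 'for (i = L; i < U; i += St)' this builds
/// '(U - L - 1 + St) / St' (possibly reassociated to avoid overflow).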
static Expr *
calculateNumIters(Sema &SemaRef, Scope *S, SourceLocation DefaultLoc,
Expr *Lower, Expr *Upper, Expr *Step, QualType LCTy,
bool TestIsStrictOp, bool RoundToStep,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
ExprResult NewStep = tryBuildCapture(SemaRef, Step, Captures);
if (!NewStep.isUsable())
return nullptr;
llvm::APSInt LRes, URes, SRes;
bool IsLowerConst = Lower->isIntegerConstantExpr(LRes, SemaRef.Context);
bool IsStepConst = Step->isIntegerConstantExpr(SRes, SemaRef.Context);
bool NoNeedToConvert = IsLowerConst && !RoundToStep &&
((!TestIsStrictOp && LRes.isNonNegative()) ||
(TestIsStrictOp && LRes.isStrictlyPositive()));
bool NeedToReorganize = false;
// Check if any subexpressions in Lower - Step [+ 1] lead to overflow.
if (!NoNeedToConvert && IsLowerConst &&
(TestIsStrictOp || (RoundToStep && IsStepConst))) {
NoNeedToConvert = true;
if (RoundToStep) {
unsigned BW = LRes.getBitWidth() > SRes.getBitWidth()
? LRes.getBitWidth()
: SRes.getBitWidth();
LRes = LRes.extend(BW + 1);
LRes.setIsSigned(true);
SRes = SRes.extend(BW + 1);
SRes.setIsSigned(true);
LRes -= SRes;
NoNeedToConvert = LRes.trunc(BW).extend(BW + 1) == LRes;
LRes = LRes.trunc(BW);
}
if (TestIsStrictOp) {
unsigned BW = LRes.getBitWidth();
LRes = LRes.extend(BW + 1);
LRes.setIsSigned(true);
++LRes;
NoNeedToConvert =
NoNeedToConvert && LRes.trunc(BW).extend(BW + 1) == LRes;
// truncate to the original bitwidth.
LRes = LRes.trunc(BW);
}
NeedToReorganize = NoNeedToConvert;
}
bool IsUpperConst = Upper->isIntegerConstantExpr(URes, SemaRef.Context);
if (NoNeedToConvert && IsLowerConst && IsUpperConst &&
(!RoundToStep || IsStepConst)) {
unsigned BW = LRes.getBitWidth() > URes.getBitWidth() ? LRes.getBitWidth()
: URes.getBitWidth();
LRes = LRes.extend(BW + 1);
LRes.setIsSigned(true);
URes = URes.extend(BW + 1);
URes.setIsSigned(true);
URes -= LRes;
NoNeedToConvert = URes.trunc(BW).extend(BW + 1) == URes;
NeedToReorganize = NoNeedToConvert;
}
// If the boundaries are not constant or (Lower - Step [+ 1]) is not constant
// or less than zero (Upper - (Lower - Step [+ 1]) may overflow) - promote to
// unsigned.
if ((!NoNeedToConvert || (LRes.isNegative() && !IsUpperConst)) &&
!LCTy->isDependentType() && LCTy->isIntegerType()) {
QualType LowerTy = Lower->getType();
QualType UpperTy = Upper->getType();
uint64_t LowerSize = SemaRef.Context.getTypeSize(LowerTy);
uint64_t UpperSize = SemaRef.Context.getTypeSize(UpperTy);
if ((LowerSize <= UpperSize && UpperTy->hasSignedIntegerRepresentation()) ||
(LowerSize > UpperSize && LowerTy->hasSignedIntegerRepresentation())) {
QualType CastType = SemaRef.Context.getIntTypeForBitwidth(
LowerSize > UpperSize ? LowerSize : UpperSize, /*Signed=*/0);
Upper =
SemaRef
.PerformImplicitConversion(
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(),
CastType, Sema::AA_Converting)
.get();
Lower = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get();
NewStep = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, NewStep.get());
}
}
if (!Lower || !Upper || NewStep.isInvalid())
return nullptr;
ExprResult Diff;
// If we need to reorganize, calculate the form as
// Upper - (Lower - Step [+ 1]).
if (NeedToReorganize) {
Diff = Lower;
if (RoundToStep) {
// Lower - Step
Diff =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
}
// Lower - Step [+ 1]
if (TestIsStrictOp)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Add, Diff.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!Diff.isUsable())
return nullptr;
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return nullptr;
// Upper - (Lower - Step [+ 1]).
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Upper, Diff.get());
if (!Diff.isUsable())
return nullptr;
} else {
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Upper, Lower);
if (!Diff.isUsable() && LCTy->getAsCXXRecordDecl()) {
// BuildBinOp already emitted an error; this one points the user to the
// upper and lower bounds and tells what is passed to 'operator-'.
SemaRef.Diag(Upper->getBeginLoc(), diag::err_omp_loop_diff_cxx)
<< Upper->getSourceRange() << Lower->getSourceRange();
return nullptr;
}
if (!Diff.isUsable())
return nullptr;
// Upper - Lower [- 1]
if (TestIsStrictOp)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Sub, Diff.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!Diff.isUsable())
return nullptr;
if (RoundToStep) {
// Upper - Lower [- 1] + Step
Diff =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Add, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
}
}
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return nullptr;
// (Upper - Lower [- 1] + Step) / Step or (Upper - Lower) / Step
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Div, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
return Diff.get();
}
/// Build the expression to calculate the number of iterations.
Expr *OpenMPIterationSpaceChecker::buildNumIterations(
Scope *S, ArrayRef<LoopIterationSpace> ResultIterSpaces, bool LimitedType,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
QualType VarType = LCDecl->getType().getNonReferenceType();
if (!VarType->isIntegerType() && !VarType->isPointerType() &&
!SemaRef.getLangOpts().CPlusPlus)
return nullptr;
Expr *LBVal = LB;
Expr *UBVal = UB;
// LB = TestIsLessOp.getValue() ? min(LB(MinVal), LB(MaxVal)) :
// max(LB(MinVal), LB(MaxVal))
if (InitDependOnLC) {
const LoopIterationSpace &IS =
ResultIterSpaces[ResultIterSpaces.size() - 1 -
InitDependOnLC.getValueOr(
CondDependOnLC.getValueOr(0))];
if (!IS.MinValue || !IS.MaxValue)
return nullptr;
// OuterVar = Min
ExprResult MinValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MinValue);
if (!MinValue.isUsable())
return nullptr;
ExprResult LBMinVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MinValue.get());
if (!LBMinVal.isUsable())
return nullptr;
// OuterVar = Min, LBVal
LBMinVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, LBMinVal.get(), LBVal);
if (!LBMinVal.isUsable())
return nullptr;
// (OuterVar = Min, LBVal)
LBMinVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, LBMinVal.get());
if (!LBMinVal.isUsable())
return nullptr;
// OuterVar = Max
ExprResult MaxValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MaxValue);
if (!MaxValue.isUsable())
return nullptr;
ExprResult LBMaxVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MaxValue.get());
if (!LBMaxVal.isUsable())
return nullptr;
// OuterVar = Max, LBVal
LBMaxVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, LBMaxVal.get(), LBVal);
if (!LBMaxVal.isUsable())
return nullptr;
// (OuterVar = Max, LBVal)
LBMaxVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, LBMaxVal.get());
if (!LBMaxVal.isUsable())
return nullptr;
Expr *LBMin = tryBuildCapture(SemaRef, LBMinVal.get(), Captures).get();
Expr *LBMax = tryBuildCapture(SemaRef, LBMaxVal.get(), Captures).get();
if (!LBMin || !LBMax)
return nullptr;
// LB(MinVal) < LB(MaxVal)
ExprResult MinLessMaxRes =
SemaRef.BuildBinOp(S, DefaultLoc, BO_LT, LBMin, LBMax);
if (!MinLessMaxRes.isUsable())
return nullptr;
Expr *MinLessMax =
tryBuildCapture(SemaRef, MinLessMaxRes.get(), Captures).get();
if (!MinLessMax)
return nullptr;
if (TestIsLessOp.getValue()) {
// LB(MinVal) < LB(MaxVal) ? LB(MinVal) : LB(MaxVal) - min(LB(MinVal),
// LB(MaxVal))
ExprResult MinLB = SemaRef.ActOnConditionalOp(DefaultLoc, DefaultLoc,
MinLessMax, LBMin, LBMax);
if (!MinLB.isUsable())
return nullptr;
LBVal = MinLB.get();
} else {
// LB(MinVal) < LB(MaxVal) ? LB(MaxVal) : LB(MinVal) - max(LB(MinVal),
// LB(MaxVal))
ExprResult MaxLB = SemaRef.ActOnConditionalOp(DefaultLoc, DefaultLoc,
MinLessMax, LBMax, LBMin);
if (!MaxLB.isUsable())
return nullptr;
LBVal = MaxLB.get();
}
}
// UB = TestIsLessOp.getValue() ? max(UB(MinVal), UB(MaxVal)) :
// min(UB(MinVal), UB(MaxVal))
if (CondDependOnLC) {
const LoopIterationSpace &IS =
ResultIterSpaces[ResultIterSpaces.size() - 1 -
InitDependOnLC.getValueOr(
CondDependOnLC.getValueOr(0))];
if (!IS.MinValue || !IS.MaxValue)
return nullptr;
// OuterVar = Min
ExprResult MinValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MinValue);
if (!MinValue.isUsable())
return nullptr;
ExprResult UBMinVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MinValue.get());
if (!UBMinVal.isUsable())
return nullptr;
// OuterVar = Min, UBVal
UBMinVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, UBMinVal.get(), UBVal);
if (!UBMinVal.isUsable())
return nullptr;
// (OuterVar = Min, UBVal)
UBMinVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, UBMinVal.get());
if (!UBMinVal.isUsable())
return nullptr;
// OuterVar = Max
ExprResult MaxValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MaxValue);
if (!MaxValue.isUsable())
return nullptr;
ExprResult UBMaxVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MaxValue.get());
if (!UBMaxVal.isUsable())
return nullptr;
// OuterVar = Max, UBVal
UBMaxVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, UBMaxVal.get(), UBVal);
if (!UBMaxVal.isUsable())
return nullptr;
// (OuterVar = Max, UBVal)
UBMaxVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, UBMaxVal.get());
if (!UBMaxVal.isUsable())
return nullptr;
Expr *UBMin = tryBuildCapture(SemaRef, UBMinVal.get(), Captures).get();
Expr *UBMax = tryBuildCapture(SemaRef, UBMaxVal.get(), Captures).get();
if (!UBMin || !UBMax)
return nullptr;
// UB(MinVal) > UB(MaxVal)
ExprResult MinGreaterMaxRes =
SemaRef.BuildBinOp(S, DefaultLoc, BO_GT, UBMin, UBMax);
if (!MinGreaterMaxRes.isUsable())
return nullptr;
Expr *MinGreaterMax =
tryBuildCapture(SemaRef, MinGreaterMaxRes.get(), Captures).get();
if (!MinGreaterMax)
return nullptr;
if (TestIsLessOp.getValue()) {
// UB(MinVal) > UB(MaxVal) ? UB(MinVal) : UB(MaxVal) - max(UB(MinVal),
// UB(MaxVal))
ExprResult MaxUB = SemaRef.ActOnConditionalOp(
DefaultLoc, DefaultLoc, MinGreaterMax, UBMin, UBMax);
if (!MaxUB.isUsable())
return nullptr;
UBVal = MaxUB.get();
} else {
// UB(MinVal) > UB(MaxVal) ? UB(MaxVal) : UB(MinVal) - min(UB(MinVal),
// UB(MaxVal))
ExprResult MinUB = SemaRef.ActOnConditionalOp(
DefaultLoc, DefaultLoc, MinGreaterMax, UBMax, UBMin);
if (!MinUB.isUsable())
return nullptr;
UBVal = MinUB.get();
}
}
Expr *UBExpr = TestIsLessOp.getValue() ? UBVal : LBVal;
Expr *LBExpr = TestIsLessOp.getValue() ? LBVal : UBVal;
Expr *Upper = tryBuildCapture(SemaRef, UBExpr, Captures).get();
Expr *Lower = tryBuildCapture(SemaRef, LBExpr, Captures).get();
if (!Upper || !Lower)
return nullptr;
ExprResult Diff =
calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper, Step, VarType,
TestIsStrictOp, /*RoundToStep=*/true, Captures);
if (!Diff.isUsable())
return nullptr;
// OpenMP runtime requires 32-bit or 64-bit loop variables.
QualType Type = Diff.get()->getType();
ASTContext &C = SemaRef.Context;
bool UseVarType = VarType->hasIntegerRepresentation() &&
C.getTypeSize(Type) > C.getTypeSize(VarType);
if (!Type->isIntegerType() || UseVarType) {
unsigned NewSize =
UseVarType ? C.getTypeSize(VarType) : C.getTypeSize(Type);
bool IsSigned = UseVarType ? VarType->hasSignedIntegerRepresentation()
: Type->hasSignedIntegerRepresentation();
Type = C.getIntTypeForBitwidth(NewSize, IsSigned);
if (!SemaRef.Context.hasSameType(Diff.get()->getType(), Type)) {
Diff = SemaRef.PerformImplicitConversion(
Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true);
if (!Diff.isUsable())
return nullptr;
}
}
if (LimitedType) {
unsigned NewSize = (C.getTypeSize(Type) > 32) ? 64 : 32;
if (NewSize != C.getTypeSize(Type)) {
if (NewSize < C.getTypeSize(Type)) {
assert(NewSize == 64 && "incorrect loop var size");
SemaRef.Diag(DefaultLoc, diag::warn_omp_loop_64_bit_var)
<< InitSrcRange << ConditionSrcRange;
}
QualType NewType = C.getIntTypeForBitwidth(
NewSize, Type->hasSignedIntegerRepresentation() ||
C.getTypeSize(Type) < NewSize);
if (!SemaRef.Context.hasSameType(Diff.get()->getType(), NewType)) {
Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType,
Sema::AA_Converting, true);
if (!Diff.isUsable())
return nullptr;
}
}
}
return Diff.get();
}
std::pair<Expr *, Expr *> OpenMPIterationSpaceChecker::buildMinMaxValues(
Scope *S, llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
// Do not build for iterators, they cannot be used in non-rectangular loop
// nests.
if (LCDecl->getType()->isRecordType())
return std::make_pair(nullptr, nullptr);
// If we subtract, the min is in the condition, otherwise the min is in the
// init value.
Expr *MinExpr = nullptr;
Expr *MaxExpr = nullptr;
Expr *LBExpr = TestIsLessOp.getValue() ? LB : UB;
Expr *UBExpr = TestIsLessOp.getValue() ? UB : LB;
bool LBNonRect = TestIsLessOp.getValue() ? InitDependOnLC.hasValue()
: CondDependOnLC.hasValue();
bool UBNonRect = TestIsLessOp.getValue() ? CondDependOnLC.hasValue()
: InitDependOnLC.hasValue();
Expr *Lower =
LBNonRect ? LBExpr : tryBuildCapture(SemaRef, LBExpr, Captures).get();
Expr *Upper =
UBNonRect ? UBExpr : tryBuildCapture(SemaRef, UBExpr, Captures).get();
if (!Upper || !Lower)
return std::make_pair(nullptr, nullptr);
if (TestIsLessOp.getValue())
MinExpr = Lower;
else
MaxExpr = Upper;
// Build minimum/maximum value based on number of iterations.
QualType VarType = LCDecl->getType().getNonReferenceType();
ExprResult Diff =
calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper, Step, VarType,
TestIsStrictOp, /*RoundToStep=*/false, Captures);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// ((Upper - Lower [- 1]) / Step) * Step
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
ExprResult NewStep = tryBuildCapture(SemaRef, Step, Captures);
if (!NewStep.isUsable())
return std::make_pair(nullptr, nullptr);
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Mul, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// Convert to the ptrdiff_t, if original type is pointer.
if (VarType->isAnyPointerType() &&
!SemaRef.Context.hasSameType(
Diff.get()->getType(),
SemaRef.Context.getUnsignedPointerDiffType())) {
Diff = SemaRef.PerformImplicitConversion(
Diff.get(), SemaRef.Context.getUnsignedPointerDiffType(),
Sema::AA_Converting, /*AllowExplicit=*/true);
}
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
if (TestIsLessOp.getValue()) {
// MinExpr = Lower;
// MaxExpr = Lower + (((Upper - Lower [- 1]) / Step) * Step)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Add,
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get(),
Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
} else {
// MaxExpr = Upper;
// MinExpr = Upper - (((Upper - Lower [- 1]) / Step) * Step)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Sub,
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(),
Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
}
// Convert to the original type.
if (SemaRef.Context.hasSameType(Diff.get()->getType(), VarType))
Diff = SemaRef.PerformImplicitConversion(Diff.get(), VarType,
Sema::AA_Converting,
/*AllowExplicit=*/true);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
Diff = SemaRef.ActOnFinishFullExpr(Diff.get(), /*DiscardedValue=*/false);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
if (TestIsLessOp.getValue())
MaxExpr = Diff.get();
else
MinExpr = Diff.get();
return std::make_pair(MinExpr, MaxExpr);
}
Expr *OpenMPIterationSpaceChecker::buildFinalCondition(Scope *S) const {
if (InitDependOnLC || CondDependOnLC)
return Condition;
return nullptr;
}
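/// Build the precondition that guards the collapsed loop nest: 'LB <op> UB'
/// evaluated once before entering the loop, a constant 'true' for
/// non-rectangular loops, or the original condition if the comparison cannot
/// be built.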
Expr *OpenMPIterationSpaceChecker::buildPreCond(
Scope *S, Expr *Cond,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
// Do not build a precondition when the condition/initialization is dependent
// to prevent pessimistic early loop exit.
// TODO: this can be improved by calculating min/max values but not sure that
// it will be very effective.
if (CondDependOnLC || InitDependOnLC)
return SemaRef.PerformImplicitConversion(
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get(),
SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
/*AllowExplicit=*/true).get();
// Try to build LB <op> UB, where <op> is <, >, <=, or >=.
Sema::TentativeAnalysisScope Trap(SemaRef);
ExprResult NewLB = tryBuildCapture(SemaRef, LB, Captures);
ExprResult NewUB = tryBuildCapture(SemaRef, UB, Captures);
if (!NewLB.isUsable() || !NewUB.isUsable())
return nullptr;
ExprResult CondExpr =
SemaRef.BuildBinOp(S, DefaultLoc,
TestIsLessOp.getValue() ?
(TestIsStrictOp ? BO_LT : BO_LE) :
(TestIsStrictOp ? BO_GT : BO_GE),
NewLB.get(), NewUB.get());
if (CondExpr.isUsable()) {
if (!SemaRef.Context.hasSameUnqualifiedType(CondExpr.get()->getType(),
SemaRef.Context.BoolTy))
CondExpr = SemaRef.PerformImplicitConversion(
CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
/*AllowExplicit=*/true);
}
// Otherwise use original loop condition and evaluate it in runtime.
return CondExpr.isUsable() ? CondExpr.get() : Cond;
}
/// Build a reference expression to the counter to be used for codegen.
DeclRefExpr *OpenMPIterationSpaceChecker::buildCounterVar(
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
DSAStackTy &DSA) const {
auto *VD = dyn_cast<VarDecl>(LCDecl);
if (!VD) {
VD = SemaRef.isOpenMPCapturedDecl(LCDecl);
DeclRefExpr *Ref = buildDeclRefExpr(
SemaRef, VD, VD->getType().getNonReferenceType(), DefaultLoc);
const DSAStackTy::DSAVarData Data =
DSA.getTopDSA(LCDecl, /*FromParent=*/false);
// If the loop control decl is explicitly marked as private, do not mark it
// as captured again.
if (!isOpenMPPrivate(Data.CKind) || !Data.RefExpr)
Captures.insert(std::make_pair(LCRef, Ref));
return Ref;
}
return cast<DeclRefExpr>(LCRef);
}
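/// Build a reference to a freshly built private copy of the loop counter to
/// be used for codegen.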
Expr *OpenMPIterationSpaceChecker::buildPrivateCounterVar() const {
if (LCDecl && !LCDecl->isInvalidDecl()) {
QualType Type = LCDecl->getType().getNonReferenceType();
VarDecl *PrivateVar = buildVarDecl(
SemaRef, DefaultLoc, Type, LCDecl->getName(),
LCDecl->hasAttrs() ? &LCDecl->getAttrs() : nullptr,
isa<VarDecl>(LCDecl)
? buildDeclRefExpr(SemaRef, cast<VarDecl>(LCDecl), Type, DefaultLoc)
: nullptr);
if (PrivateVar->isInvalidDecl())
return nullptr;
return buildDeclRefExpr(SemaRef, PrivateVar, Type, DefaultLoc);
}
return nullptr;
}
/// Build initialization of the counter to be used for codegen.
Expr *OpenMPIterationSpaceChecker::buildCounterInit() const { return LB; }
/// Build the step of the counter to be used for codegen.
Expr *OpenMPIterationSpaceChecker::buildCounterStep() const { return Step; }
Expr *OpenMPIterationSpaceChecker::buildOrderedLoopData(
Scope *S, Expr *Counter,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures, SourceLocation Loc,
Expr *Inc, OverloadedOperatorKind OOK) {
Expr *Cnt = SemaRef.DefaultLvalueConversion(Counter).get();
if (!Cnt)
return nullptr;
if (Inc) {
assert((OOK == OO_Plus || OOK == OO_Minus) &&
"Expected only + or - operations for depend clauses.");
BinaryOperatorKind BOK = (OOK == OO_Plus) ? BO_Add : BO_Sub;
Cnt = SemaRef.BuildBinOp(S, Loc, BOK, Cnt, Inc).get();
if (!Cnt)
return nullptr;
}
QualType VarType = LCDecl->getType().getNonReferenceType();
if (!VarType->isIntegerType() && !VarType->isPointerType() &&
!SemaRef.getLangOpts().CPlusPlus)
return nullptr;
// Upper - Lower
Expr *Upper = TestIsLessOp.getValue()
? Cnt
: tryBuildCapture(SemaRef, LB, Captures).get();
Expr *Lower = TestIsLessOp.getValue()
? tryBuildCapture(SemaRef, LB, Captures).get()
: Cnt;
if (!Upper || !Lower)
return nullptr;
ExprResult Diff = calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper,
Step, VarType, /*TestIsStrictOp=*/false,
/*RoundToStep=*/false, Captures);
if (!Diff.isUsable())
return nullptr;
return Diff.get();
}
} // namespace
void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) {
assert(getLangOpts().OpenMP && "OpenMP is not active.");
assert(Init && "Expected loop in canonical form.");
unsigned AssociatedLoops = DSAStack->getAssociatedLoops();
if (AssociatedLoops > 0 &&
isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
DSAStack->loopStart();
OpenMPIterationSpaceChecker ISC(*this, *DSAStack, ForLoc);
if (!ISC.checkAndSetInit(Init, /*EmitDiags=*/false)) {
if (ValueDecl *D = ISC.getLoopDecl()) {
auto *VD = dyn_cast<VarDecl>(D);
DeclRefExpr *PrivateRef = nullptr;
if (!VD) {
if (VarDecl *Private = isOpenMPCapturedDecl(D)) {
VD = Private;
} else {
PrivateRef = buildCapture(*this, D, ISC.getLoopDeclRefExpr(),
/*WithInit=*/false);
VD = cast<VarDecl>(PrivateRef->getDecl());
}
}
DSAStack->addLoopControlVariable(D, VD);
const Decl *LD = DSAStack->getPossiblyLoopCunter();
if (LD != D->getCanonicalDecl()) {
DSAStack->resetPossibleLoopCounter();
if (auto *Var = dyn_cast_or_null<VarDecl>(LD))
MarkDeclarationsReferencedInExpr(
buildDeclRefExpr(*this, const_cast<VarDecl *>(Var),
Var->getType().getNonLValueExprType(Context),
ForLoc, /*RefersToCapture=*/true));
}
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
// OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables
// Referenced in a Construct, C/C++]. The loop iteration variable in the
// associated for-loop of a simd construct with just one associated
// for-loop may be listed in a linear clause with a constant-linear-step
// that is the increment of the associated for-loop. The loop iteration
// variable(s) in the associated for-loop(s) of a for or parallel for
// construct may be listed in a private or lastprivate clause.
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
// If LoopDeclRefExpr is nullptr it means the corresponding loop variable
// is declared in the loop and it is predetermined as private.
Expr *LoopDeclRefExpr = ISC.getLoopDeclRefExpr();
OpenMPClauseKind PredeterminedCKind =
isOpenMPSimdDirective(DKind)
? (DSAStack->hasMutipleLoops() ? OMPC_lastprivate : OMPC_linear)
: OMPC_private;
if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
DVar.CKind != PredeterminedCKind && DVar.RefExpr &&
(LangOpts.OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate &&
DVar.CKind != OMPC_private))) ||
((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop ||
DKind == OMPD_master_taskloop ||
DKind == OMPD_parallel_master_taskloop ||
isOpenMPDistributeDirective(DKind)) &&
!isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
DVar.CKind != OMPC_private && DVar.CKind != OMPC_lastprivate)) &&
(DVar.CKind != OMPC_private || DVar.RefExpr)) {
Diag(Init->getBeginLoc(), diag::err_omp_loop_var_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPDirectiveName(DKind)
<< getOpenMPClauseName(PredeterminedCKind);
if (DVar.RefExpr == nullptr)
DVar.CKind = PredeterminedCKind;
reportOriginalDsa(*this, DSAStack, D, DVar,
/*IsLoopIterVar=*/true);
} else if (LoopDeclRefExpr) {
// Make the loop iteration variable private (for worksharing
// constructs), linear (for simd directives with the only one
// associated loop) or lastprivate (for simd directives with several
// collapsed or ordered loops).
if (DVar.CKind == OMPC_unknown)
DSAStack->addDSA(D, LoopDeclRefExpr, PredeterminedCKind,
PrivateRef);
}
}
}
DSAStack->setAssociatedLoops(AssociatedLoops - 1);
}
}
/// Called on a for stmt to check and extract its iteration space
/// for further processing (such as collapsing).
static bool checkOpenMPIterationSpace(
OpenMPDirectiveKind DKind, Stmt *S, Sema &SemaRef, DSAStackTy &DSA,
unsigned CurrentNestedLoopCount, unsigned NestedLoopCount,
unsigned TotalNestedLoopCount, Expr *CollapseLoopCountExpr,
Expr *OrderedLoopCountExpr,
Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
llvm::MutableArrayRef<LoopIterationSpace> ResultIterSpaces,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
// OpenMP [2.9.1, Canonical Loop Form]
// for (init-expr; test-expr; incr-expr) structured-block
// for (range-decl: range-expr) structured-block
auto *For = dyn_cast_or_null<ForStmt>(S);
auto *CXXFor = dyn_cast_or_null<CXXForRangeStmt>(S);
// Range-based for is supported only since OpenMP 5.0.
if (!For && (SemaRef.LangOpts.OpenMP <= 45 || !CXXFor)) {
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_not_for)
<< (CollapseLoopCountExpr != nullptr || OrderedLoopCountExpr != nullptr)
<< getOpenMPDirectiveName(DKind) << TotalNestedLoopCount
<< (CurrentNestedLoopCount > 0) << CurrentNestedLoopCount;
if (TotalNestedLoopCount > 1) {
if (CollapseLoopCountExpr && OrderedLoopCountExpr)
SemaRef.Diag(DSA.getConstructLoc(),
diag::note_omp_collapse_ordered_expr)
<< 2 << CollapseLoopCountExpr->getSourceRange()
<< OrderedLoopCountExpr->getSourceRange();
else if (CollapseLoopCountExpr)
SemaRef.Diag(CollapseLoopCountExpr->getExprLoc(),
diag::note_omp_collapse_ordered_expr)
<< 0 << CollapseLoopCountExpr->getSourceRange();
else
SemaRef.Diag(OrderedLoopCountExpr->getExprLoc(),
diag::note_omp_collapse_ordered_expr)
<< 1 << OrderedLoopCountExpr->getSourceRange();
}
return true;
}
assert(((For && For->getBody()) || (CXXFor && CXXFor->getBody())) &&
"No loop body.");
OpenMPIterationSpaceChecker ISC(SemaRef, DSA,
For ? For->getForLoc() : CXXFor->getForLoc());
// Check init.
Stmt *Init = For ? For->getInit() : CXXFor->getBeginStmt();
if (ISC.checkAndSetInit(Init))
return true;
bool HasErrors = false;
// Check loop variable's type.
if (ValueDecl *LCDecl = ISC.getLoopDecl()) {
// OpenMP [2.6, Canonical Loop Form]
// Var is one of the following:
// A variable of signed or unsigned integer type.
// For C++, a variable of a random access iterator type.
// For C, a variable of a pointer type.
QualType VarType = LCDecl->getType().getNonReferenceType();
if (!VarType->isDependentType() && !VarType->isIntegerType() &&
!VarType->isPointerType() &&
!(SemaRef.getLangOpts().CPlusPlus && VarType->isOverloadableType())) {
SemaRef.Diag(Init->getBeginLoc(), diag::err_omp_loop_variable_type)
<< SemaRef.getLangOpts().CPlusPlus;
HasErrors = true;
}
// OpenMP, 2.14.1.1 Data-sharing Attribute Rules for Variables Referenced in
// a Construct
// The loop iteration variable(s) in the associated for-loop(s) of a for or
// parallel for construct is (are) private.
// The loop iteration variable in the associated for-loop of a simd
// construct with just one associated for-loop is linear with a
// constant-linear-step that is the increment of the associated for-loop.
// Exclude loop var from the list of variables with implicitly defined data
// sharing attributes.
VarsWithImplicitDSA.erase(LCDecl);
assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
// Check test-expr.
HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond());
// Check incr-expr.
HasErrors |= ISC.checkAndSetInc(For ? For->getInc() : CXXFor->getInc());
}
if (ISC.dependent() || SemaRef.CurContext->isDependentContext() || HasErrors)
return HasErrors;
// Build the loop's iteration space representation.
ResultIterSpaces[CurrentNestedLoopCount].PreCond = ISC.buildPreCond(
DSA.getCurScope(), For ? For->getCond() : CXXFor->getCond(), Captures);
ResultIterSpaces[CurrentNestedLoopCount].NumIterations =
ISC.buildNumIterations(DSA.getCurScope(), ResultIterSpaces,
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind)),
Captures);
ResultIterSpaces[CurrentNestedLoopCount].CounterVar =
ISC.buildCounterVar(Captures, DSA);
ResultIterSpaces[CurrentNestedLoopCount].PrivateCounterVar =
ISC.buildPrivateCounterVar();
ResultIterSpaces[CurrentNestedLoopCount].CounterInit = ISC.buildCounterInit();
ResultIterSpaces[CurrentNestedLoopCount].CounterStep = ISC.buildCounterStep();
ResultIterSpaces[CurrentNestedLoopCount].InitSrcRange = ISC.getInitSrcRange();
ResultIterSpaces[CurrentNestedLoopCount].CondSrcRange =
ISC.getConditionSrcRange();
ResultIterSpaces[CurrentNestedLoopCount].IncSrcRange =
ISC.getIncrementSrcRange();
ResultIterSpaces[CurrentNestedLoopCount].Subtract = ISC.shouldSubtractStep();
ResultIterSpaces[CurrentNestedLoopCount].IsStrictCompare =
ISC.isStrictTestOp();
std::tie(ResultIterSpaces[CurrentNestedLoopCount].MinValue,
ResultIterSpaces[CurrentNestedLoopCount].MaxValue) =
ISC.buildMinMaxValues(DSA.getCurScope(), Captures);
ResultIterSpaces[CurrentNestedLoopCount].FinalCondition =
ISC.buildFinalCondition(DSA.getCurScope());
ResultIterSpaces[CurrentNestedLoopCount].IsNonRectangularLB =
ISC.doesInitDependOnLC();
ResultIterSpaces[CurrentNestedLoopCount].IsNonRectangularUB =
ISC.doesCondDependOnLC();
ResultIterSpaces[CurrentNestedLoopCount].LoopDependentIdx =
ISC.getLoopDependentIdx();
HasErrors |=
(ResultIterSpaces[CurrentNestedLoopCount].PreCond == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].NumIterations == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterVar == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].PrivateCounterVar == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterInit == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterStep == nullptr);
if (!HasErrors && DSA.isOrderedRegion()) {
if (DSA.getOrderedRegionParam().second->getNumForLoops()) {
if (CurrentNestedLoopCount <
DSA.getOrderedRegionParam().second->getLoopNumIterations().size()) {
DSA.getOrderedRegionParam().second->setLoopNumIterations(
CurrentNestedLoopCount,
ResultIterSpaces[CurrentNestedLoopCount].NumIterations);
DSA.getOrderedRegionParam().second->setLoopCounter(
CurrentNestedLoopCount,
ResultIterSpaces[CurrentNestedLoopCount].CounterVar);
}
}
for (auto &Pair : DSA.getDoacrossDependClauses()) {
if (CurrentNestedLoopCount >= Pair.first->getNumLoops()) {
// Erroneous case - clause has some problems.
continue;
}
if (Pair.first->getDependencyKind() == OMPC_DEPEND_sink &&
Pair.second.size() <= CurrentNestedLoopCount) {
// Erroneous case - clause has some problems.
Pair.first->setLoopData(CurrentNestedLoopCount, nullptr);
continue;
}
Expr *CntValue;
if (Pair.first->getDependencyKind() == OMPC_DEPEND_source)
CntValue = ISC.buildOrderedLoopData(
DSA.getCurScope(),
ResultIterSpaces[CurrentNestedLoopCount].CounterVar, Captures,
Pair.first->getDependencyLoc());
else
CntValue = ISC.buildOrderedLoopData(
DSA.getCurScope(),
ResultIterSpaces[CurrentNestedLoopCount].CounterVar, Captures,
Pair.first->getDependencyLoc(),
Pair.second[CurrentNestedLoopCount].first,
Pair.second[CurrentNestedLoopCount].second);
Pair.first->setLoopData(CurrentNestedLoopCount, CntValue);
}
}
return HasErrors;
}
/// Build 'VarRef = Start'.
static ExprResult
buildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef,
ExprResult Start, bool IsNonRectangularLB,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
// Build 'VarRef = Start'.
ExprResult NewStart = IsNonRectangularLB
? Start.get()
: tryBuildCapture(SemaRef, Start.get(), Captures);
if (!NewStart.isUsable())
return ExprError();
if (!SemaRef.Context.hasSameType(NewStart.get()->getType(),
VarRef.get()->getType())) {
NewStart = SemaRef.PerformImplicitConversion(
NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting,
/*AllowExplicit=*/true);
if (!NewStart.isUsable())
return ExprError();
}
ExprResult Init =
SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
return Init;
}
/// Build 'VarRef = Start + Iter * Step'.
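/// For counters of class type the update is built as
/// 'VarRef = Start, VarRef (+|-)= Iter * Step' so that user-defined compound
/// assignment operators are used; otherwise it falls back to
/// 'VarRef = Start (+|-) Iter * Step'.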
static ExprResult buildCounterUpdate(
Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef,
ExprResult Start, ExprResult Iter, ExprResult Step, bool Subtract,
bool IsNonRectangularLB,
llvm::MapVector<const Expr *, DeclRefExpr *> *Captures = nullptr) {
// Add parentheses (for debugging purposes only).
Iter = SemaRef.ActOnParenExpr(Loc, Loc, Iter.get());
if (!VarRef.isUsable() || !Start.isUsable() || !Iter.isUsable() ||
!Step.isUsable())
return ExprError();
ExprResult NewStep = Step;
if (Captures)
NewStep = tryBuildCapture(SemaRef, Step.get(), *Captures);
if (NewStep.isInvalid())
return ExprError();
ExprResult Update =
SemaRef.BuildBinOp(S, Loc, BO_Mul, Iter.get(), NewStep.get());
if (!Update.isUsable())
return ExprError();
// Try to build 'VarRef = Start, VarRef (+|-)= Iter * Step' or
// 'VarRef = Start (+|-) Iter * Step'.
if (!Start.isUsable())
return ExprError();
ExprResult NewStart = SemaRef.ActOnParenExpr(Loc, Loc, Start.get());
if (!NewStart.isUsable())
return ExprError();
if (Captures && !IsNonRectangularLB)
NewStart = tryBuildCapture(SemaRef, Start.get(), *Captures);
if (NewStart.isInvalid())
return ExprError();
// First attempt: try to build 'VarRef = Start, VarRef += Iter * Step'.
ExprResult SavedUpdate = Update;
ExprResult UpdateVal;
if (VarRef.get()->getType()->isOverloadableType() ||
NewStart.get()->getType()->isOverloadableType() ||
Update.get()->getType()->isOverloadableType()) {
Sema::TentativeAnalysisScope Trap(SemaRef);
Update =
SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
if (Update.isUsable()) {
UpdateVal =
SemaRef.BuildBinOp(S, Loc, Subtract ? BO_SubAssign : BO_AddAssign,
VarRef.get(), SavedUpdate.get());
if (UpdateVal.isUsable()) {
Update = SemaRef.CreateBuiltinBinOp(Loc, BO_Comma, Update.get(),
UpdateVal.get());
}
}
}
// Second attempt: try to build 'VarRef = Start (+|-) Iter * Step'.
if (!Update.isUsable() || !UpdateVal.isUsable()) {
Update = SemaRef.BuildBinOp(S, Loc, Subtract ? BO_Sub : BO_Add,
NewStart.get(), SavedUpdate.get());
if (!Update.isUsable())
return ExprError();
if (!SemaRef.Context.hasSameType(Update.get()->getType(),
VarRef.get()->getType())) {
Update = SemaRef.PerformImplicitConversion(
Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true);
if (!Update.isUsable())
return ExprError();
}
Update = SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), Update.get());
}
return Update;
}
/// Convert integer expression \a E to make it have at least \a Bits
/// bits.
static ExprResult widenIterationCount(unsigned Bits, Expr *E, Sema &SemaRef) {
if (E == nullptr)
return ExprError();
ASTContext &C = SemaRef.Context;
QualType OldType = E->getType();
unsigned HasBits = C.getTypeSize(OldType);
if (HasBits >= Bits)
return ExprResult(E);
// OK to convert to signed, because new type has more bits than old.
QualType NewType = C.getIntTypeForBitwidth(Bits, /* Signed */ true);
return SemaRef.PerformImplicitConversion(E, NewType, Sema::AA_Converting,
true);
}
/// Check if the given expression \a E is a constant integer that fits
/// into \a Bits bits.
static bool fitsInto(unsigned Bits, bool Signed, const Expr *E, Sema &SemaRef) {
if (E == nullptr)
return false;
llvm::APSInt Result;
if (E->isIntegerConstantExpr(Result, SemaRef.Context))
return Signed ? Result.isSignedIntN(Bits) : Result.isIntN(Bits);
return false;
}
/// Build preinits statement for the given declarations.
static Stmt *buildPreInits(ASTContext &Context,
MutableArrayRef<Decl *> PreInits) {
if (!PreInits.empty()) {
return new (Context) DeclStmt(
DeclGroupRef::Create(Context, PreInits.begin(), PreInits.size()),
SourceLocation(), SourceLocation());
}
return nullptr;
}
/// Build preinits statement for the given captured expressions.
static Stmt *
buildPreInits(ASTContext &Context,
const llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
if (!Captures.empty()) {
SmallVector<Decl *, 16> PreInits;
for (const auto &Pair : Captures)
PreInits.push_back(Pair.second->getDecl());
return buildPreInits(Context, PreInits);
}
return nullptr;
}
/// Build postupdate expression for the given list of postupdate expressions.
static Expr *buildPostUpdate(Sema &S, ArrayRef<Expr *> PostUpdates) {
Expr *PostUpdate = nullptr;
if (!PostUpdates.empty()) {
for (Expr *E : PostUpdates) {
Expr *ConvE = S.BuildCStyleCastExpr(
E->getExprLoc(),
S.Context.getTrivialTypeSourceInfo(S.Context.VoidTy),
E->getExprLoc(), E)
.get();
PostUpdate = PostUpdate
? S.CreateBuiltinBinOp(ConvE->getExprLoc(), BO_Comma,
PostUpdate, ConvE)
.get()
: ConvE;
}
}
return PostUpdate;
}
/// Called on a for stmt to check itself and nested loops (if any).
/// \return Returns 0 if one of the collapsed stmts is not a canonical for
/// loop, the number of collapsed loops otherwise.
static unsigned
checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef,
DSAStackTy &DSA,
Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
OMPLoopDirective::HelperExprs &Built) {
unsigned NestedLoopCount = 1;
if (CollapseLoopCountExpr) {
// Found 'collapse' clause - calculate collapse number.
Expr::EvalResult Result;
if (!CollapseLoopCountExpr->isValueDependent() &&
CollapseLoopCountExpr->EvaluateAsInt(Result, SemaRef.getASTContext())) {
NestedLoopCount = Result.Val.getInt().getLimitedValue();
} else {
Built.clear(/*Size=*/1);
return 1;
}
}
unsigned OrderedLoopCount = 1;
if (OrderedLoopCountExpr) {
// Found 'ordered' clause - calculate the number of ordered loops.
Expr::EvalResult EVResult;
if (!OrderedLoopCountExpr->isValueDependent() &&
OrderedLoopCountExpr->EvaluateAsInt(EVResult,
SemaRef.getASTContext())) {
llvm::APSInt Result = EVResult.Val.getInt();
if (Result.getLimitedValue() < NestedLoopCount) {
SemaRef.Diag(OrderedLoopCountExpr->getExprLoc(),
diag::err_omp_wrong_ordered_loop_count)
<< OrderedLoopCountExpr->getSourceRange();
SemaRef.Diag(CollapseLoopCountExpr->getExprLoc(),
diag::note_collapse_loop_count)
<< CollapseLoopCountExpr->getSourceRange();
}
OrderedLoopCount = Result.getLimitedValue();
} else {
Built.clear(/*Size=*/1);
return 1;
}
}
// This is a helper routine for loop directives (e.g., 'for', 'simd',
// 'for simd', etc.).
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
SmallVector<LoopIterationSpace, 4> IterSpaces(
std::max(OrderedLoopCount, NestedLoopCount));
Stmt *CurStmt = AStmt->IgnoreContainers(/* IgnoreCaptured */ true);
for (unsigned Cnt = 0; Cnt < NestedLoopCount; ++Cnt) {
if (checkOpenMPIterationSpace(
DKind, CurStmt, SemaRef, DSA, Cnt, NestedLoopCount,
std::max(OrderedLoopCount, NestedLoopCount), CollapseLoopCountExpr,
OrderedLoopCountExpr, VarsWithImplicitDSA, IterSpaces, Captures))
return 0;
// Move on to the next nested for loop, or to the loop body.
// OpenMP [2.8.1, simd construct, Restrictions]
// All loops associated with the construct must be perfectly nested; that
// is, there must be no intervening code nor any OpenMP directive between
// any two loops.
if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
CurStmt = For->getBody();
} else {
assert(isa<CXXForRangeStmt>(CurStmt) &&
"Expected canonical for or range-based for loops.");
CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
}
CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
CurStmt, SemaRef.LangOpts.OpenMP >= 50);
}
for (unsigned Cnt = NestedLoopCount; Cnt < OrderedLoopCount; ++Cnt) {
if (checkOpenMPIterationSpace(
DKind, CurStmt, SemaRef, DSA, Cnt, NestedLoopCount,
std::max(OrderedLoopCount, NestedLoopCount), CollapseLoopCountExpr,
OrderedLoopCountExpr, VarsWithImplicitDSA, IterSpaces, Captures))
return 0;
if (Cnt > 0 && IterSpaces[Cnt].CounterVar) {
// Handle initialization of captured loop iterator variables.
auto *DRE = cast<DeclRefExpr>(IterSpaces[Cnt].CounterVar);
if (isa<OMPCapturedExprDecl>(DRE->getDecl())) {
Captures[DRE] = DRE;
}
}
// Move on to the next nested for loop, or to the loop body.
// OpenMP [2.8.1, simd construct, Restrictions]
// All loops associated with the construct must be perfectly nested; that
// is, there must be no intervening code nor any OpenMP directive between
// any two loops.
if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
CurStmt = For->getBody();
} else {
assert(isa<CXXForRangeStmt>(CurStmt) &&
"Expected canonical for or range-based for loops.");
CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
}
CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
CurStmt, SemaRef.LangOpts.OpenMP >= 50);
}
Built.clear(/* size */ NestedLoopCount);
if (SemaRef.CurContext->isDependentContext())
return NestedLoopCount;
// An example of what is generated for the following code:
//
// #pragma omp simd collapse(2) ordered(2)
// for (i = 0; i < NI; ++i)
// for (k = 0; k < NK; ++k)
// for (j = J0; j < NJ; j+=2) {
// <loop body>
// }
//
// We generate the code below.
// Note: the loop body may be outlined in CodeGen.
// Note: some counters may be C++ classes, operator- is used to find number of
// iterations and operator+= to calculate counter value.
// Note: decltype(NumIterations) must be integer type (in 'omp for', only i32
// or i64 is currently supported).
//
// #define NumIterations (NI * ((NJ - J0 - 1 + 2) / 2))
// for (int[32|64]_t IV = 0; IV < NumIterations; ++IV ) {
// .local.i = IV / ((NJ - J0 - 1 + 2) / 2);
// .local.j = J0 + (IV % ((NJ - J0 - 1 + 2) / 2)) * 2;
// // similar updates for vars in clauses (e.g. 'linear')
// <loop body (using local i and j)>
// }
// i = NI; // assign final values of counters
// j = NJ;
//
// Last iteration number is (I1 * I2 * ... In) - 1, where I1, I2 ... In are
// the iteration counts of the collapsed for loops.
// Precondition tests if there is at least one iteration (all conditions are
// true).
auto PreCond = ExprResult(IterSpaces[0].PreCond);
Expr *N0 = IterSpaces[0].NumIterations;
ExprResult LastIteration32 =
widenIterationCount(/*Bits=*/32,
SemaRef
.PerformImplicitConversion(
N0->IgnoreImpCasts(), N0->getType(),
Sema::AA_Converting, /*AllowExplicit=*/true)
.get(),
SemaRef);
ExprResult LastIteration64 = widenIterationCount(
/*Bits=*/64,
SemaRef
.PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(),
Sema::AA_Converting,
/*AllowExplicit=*/true)
.get(),
SemaRef);
if (!LastIteration32.isUsable() || !LastIteration64.isUsable())
return NestedLoopCount;
ASTContext &C = SemaRef.Context;
bool AllCountsNeedLessThan32Bits = C.getTypeSize(N0->getType()) < 32;
Scope *CurScope = DSA.getCurScope();
for (unsigned Cnt = 1; Cnt < NestedLoopCount; ++Cnt) {
if (PreCond.isUsable()) {
PreCond =
SemaRef.BuildBinOp(CurScope, PreCond.get()->getExprLoc(), BO_LAnd,
PreCond.get(), IterSpaces[Cnt].PreCond);
}
Expr *N = IterSpaces[Cnt].NumIterations;
SourceLocation Loc = N->getExprLoc();
AllCountsNeedLessThan32Bits &= C.getTypeSize(N->getType()) < 32;
if (LastIteration32.isUsable())
LastIteration32 = SemaRef.BuildBinOp(
CurScope, Loc, BO_Mul, LastIteration32.get(),
SemaRef
.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
Sema::AA_Converting,
/*AllowExplicit=*/true)
.get());
if (LastIteration64.isUsable())
LastIteration64 = SemaRef.BuildBinOp(
CurScope, Loc, BO_Mul, LastIteration64.get(),
SemaRef
.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
Sema::AA_Converting,
/*AllowExplicit=*/true)
.get());
}
// Choose either the 32-bit or 64-bit version.
ExprResult LastIteration = LastIteration64;
if (SemaRef.getLangOpts().OpenMPOptimisticCollapse ||
(LastIteration32.isUsable() &&
C.getTypeSize(LastIteration32.get()->getType()) == 32 &&
(AllCountsNeedLessThan32Bits || NestedLoopCount == 1 ||
fitsInto(
/*Bits=*/32,
LastIteration32.get()->getType()->hasSignedIntegerRepresentation(),
LastIteration64.get(), SemaRef))))
LastIteration = LastIteration32;
QualType VType = LastIteration.get()->getType();
QualType RealVType = VType;
QualType StrideVType = VType;
if (isOpenMPTaskLoopDirective(DKind)) {
VType =
SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
StrideVType =
SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
}
if (!LastIteration.isUsable())
return 0;
// Save the number of iterations.
ExprResult NumIterations = LastIteration;
{
LastIteration = SemaRef.BuildBinOp(
CurScope, LastIteration.get()->getExprLoc(), BO_Sub,
LastIteration.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!LastIteration.isUsable())
return 0;
}
// Calculate the last iteration number beforehand instead of doing this on
// each iteration. Do not do this if the number of iterations can be
// constant-folded.
llvm::APSInt Result;
bool IsConstant =
LastIteration.get()->isIntegerConstantExpr(Result, SemaRef.Context);
ExprResult CalcLastIteration;
if (!IsConstant) {
ExprResult SaveRef =
tryBuildCapture(SemaRef, LastIteration.get(), Captures);
LastIteration = SaveRef;
// Prepare SaveRef + 1.
NumIterations = SemaRef.BuildBinOp(
CurScope, SaveRef.get()->getExprLoc(), BO_Add, SaveRef.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!NumIterations.isUsable())
return 0;
}
SourceLocation InitLoc = IterSpaces[0].InitSrcRange.getBegin();
// Build variables passed into runtime, necessary for worksharing directives.
ExprResult LB, UB, IL, ST, EUB, CombLB, CombUB, PrevLB, PrevUB, CombEUB;
if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind)) {
// Lower bound variable, initialized with zero.
VarDecl *LBDecl = buildVarDecl(SemaRef, InitLoc, VType, ".omp.lb");
LB = buildDeclRefExpr(SemaRef, LBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(LBDecl,
SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Upper bound variable, initialized with last iteration number.
VarDecl *UBDecl = buildVarDecl(SemaRef, InitLoc, VType, ".omp.ub");
UB = buildDeclRefExpr(SemaRef, UBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(UBDecl, LastIteration.get(),
/*DirectInit*/ false);
// A 32-bit flag variable in which the runtime returns 1 for the last iteration.
// This will be used to implement clause 'lastprivate'.
QualType Int32Ty = SemaRef.Context.getIntTypeForBitwidth(32, true);
VarDecl *ILDecl = buildVarDecl(SemaRef, InitLoc, Int32Ty, ".omp.is_last");
IL = buildDeclRefExpr(SemaRef, ILDecl, Int32Ty, InitLoc);
SemaRef.AddInitializerToDecl(ILDecl,
SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Stride variable returned by runtime (we initialize it to 1 by default).
VarDecl *STDecl =
buildVarDecl(SemaRef, InitLoc, StrideVType, ".omp.stride");
ST = buildDeclRefExpr(SemaRef, STDecl, StrideVType, InitLoc);
SemaRef.AddInitializerToDecl(STDecl,
SemaRef.ActOnIntegerConstant(InitLoc, 1).get(),
/*DirectInit*/ false);
// Build expression: UB = min(UB, LastIteration)
// It is necessary for CodeGen of directives with static scheduling.
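// (Editorial sketch of the motivation, not from the original source: with
// chunked static scheduling the runtime may hand a thread an upper bound past
// LastIteration, e.g. a chunk [8, 11] when only iterations 0..9 exist, and
// the clamp built below trims that chunk to [8, 9].)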
ExprResult IsUBGreater = SemaRef.BuildBinOp(CurScope, InitLoc, BO_GT,
UB.get(), LastIteration.get());
ExprResult CondOp = SemaRef.ActOnConditionalOp(
LastIteration.get()->getExprLoc(), InitLoc, IsUBGreater.get(),
LastIteration.get(), UB.get());
EUB = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, UB.get(),
CondOp.get());
EUB = SemaRef.ActOnFinishFullExpr(EUB.get(), /*DiscardedValue*/ false);
// If we have a combined directive that combines 'distribute', 'for' or
// 'simd' we need to be able to access the bounds of the schedule of the
// enclosing region. E.g. in 'distribute parallel for' the bounds obtained
// by scheduling 'distribute' have to be passed to the schedule of 'for'.
if (isOpenMPLoopBoundSharingDirective(DKind)) {
// Lower bound variable, initialized with zero.
VarDecl *CombLBDecl =
buildVarDecl(SemaRef, InitLoc, VType, ".omp.comb.lb");
CombLB = buildDeclRefExpr(SemaRef, CombLBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(
CombLBDecl, SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Upper bound variable, initialized with last iteration number.
VarDecl *CombUBDecl =
buildVarDecl(SemaRef, InitLoc, VType, ".omp.comb.ub");
CombUB = buildDeclRefExpr(SemaRef, CombUBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(CombUBDecl, LastIteration.get(),
/*DirectInit*/ false);
ExprResult CombIsUBGreater = SemaRef.BuildBinOp(
CurScope, InitLoc, BO_GT, CombUB.get(), LastIteration.get());
ExprResult CombCondOp =
SemaRef.ActOnConditionalOp(InitLoc, InitLoc, CombIsUBGreater.get(),
LastIteration.get(), CombUB.get());
CombEUB = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, CombUB.get(),
CombCondOp.get());
CombEUB =
SemaRef.ActOnFinishFullExpr(CombEUB.get(), /*DiscardedValue*/ false);
const CapturedDecl *CD = cast<CapturedStmt>(AStmt)->getCapturedDecl();
// We expect to have at least 2 more parameters than the 'parallel'
// directive does - the lower and upper bounds of the previous schedule.
assert(CD->getNumParams() >= 4 &&
"Unexpected number of parameters in loop combined directive");
// Set the proper type for the bounds given what we learned from the
// enclosed loops.
ImplicitParamDecl *PrevLBDecl = CD->getParam(/*PrevLB=*/2);
ImplicitParamDecl *PrevUBDecl = CD->getParam(/*PrevUB=*/3);
// Previous lower and upper bounds are obtained from the region
// parameters.
PrevLB =
buildDeclRefExpr(SemaRef, PrevLBDecl, PrevLBDecl->getType(), InitLoc);
PrevUB =
buildDeclRefExpr(SemaRef, PrevUBDecl, PrevUBDecl->getType(), InitLoc);
}
}
// Build the iteration variable and its initialization before loop.
ExprResult IV;
ExprResult Init, CombInit;
{
VarDecl *IVDecl = buildVarDecl(SemaRef, InitLoc, RealVType, ".omp.iv");
IV = buildDeclRefExpr(SemaRef, IVDecl, RealVType, InitLoc);
Expr *RHS =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
? LB.get()
: SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
Init = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, IV.get(), RHS);
Init = SemaRef.ActOnFinishFullExpr(Init.get(), /*DiscardedValue*/ false);
if (isOpenMPLoopBoundSharingDirective(DKind)) {
Expr *CombRHS =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind))
? CombLB.get()
: SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
CombInit =
SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, IV.get(), CombRHS);
CombInit =
SemaRef.ActOnFinishFullExpr(CombInit.get(), /*DiscardedValue*/ false);
}
}
bool UseStrictCompare =
RealVType->hasUnsignedIntegerRepresentation() &&
llvm::all_of(IterSpaces, [](const LoopIterationSpace &LIS) {
return LIS.IsStrictCompare;
});
// Loop condition: (IV < NumIterations), or, for worksharing loops, (IV <= UB)
// (or (IV < UB + 1) for an unsigned IV).
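// (Editorial example: for an unsigned IV with UB == 7 the strict form
// 'IV < 8' is used instead of 'IV <= 7'; the BoundUB adjustment below adds
// the '+ 1'.)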
SourceLocation CondLoc = AStmt->getBeginLoc();
Expr *BoundUB = UB.get();
if (UseStrictCompare) {
BoundUB =
SemaRef
.BuildBinOp(CurScope, CondLoc, BO_Add, BoundUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
.get();
BoundUB =
SemaRef.ActOnFinishFullExpr(BoundUB, /*DiscardedValue*/ false).get();
}
ExprResult Cond =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
? SemaRef.BuildBinOp(CurScope, CondLoc,
UseStrictCompare ? BO_LT : BO_LE, IV.get(),
BoundUB)
: SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(),
NumIterations.get());
ExprResult CombDistCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
CombDistCond = SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(),
NumIterations.get());
}
ExprResult CombCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
Expr *BoundCombUB = CombUB.get();
if (UseStrictCompare) {
BoundCombUB =
SemaRef
.BuildBinOp(
CurScope, CondLoc, BO_Add, BoundCombUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
.get();
BoundCombUB =
SemaRef.ActOnFinishFullExpr(BoundCombUB, /*DiscardedValue*/ false)
.get();
}
CombCond =
SemaRef.BuildBinOp(CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE,
IV.get(), BoundCombUB);
}
// Loop increment (IV = IV + 1)
SourceLocation IncLoc = AStmt->getBeginLoc();
ExprResult Inc =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(),
SemaRef.ActOnIntegerConstant(IncLoc, 1).get());
if (!Inc.isUsable())
return 0;
Inc = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, IV.get(), Inc.get());
Inc = SemaRef.ActOnFinishFullExpr(Inc.get(), /*DiscardedValue*/ false);
if (!Inc.isUsable())
return 0;
// Increments for worksharing loops (LB = LB + ST; UB = UB + ST).
// Used for directives with static scheduling.
// In combined constructs, also add a combined version that uses the CombLB and
// CombUB base variables for the update.
ExprResult NextLB, NextUB, CombNextLB, CombNextUB;
if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind)) {
// LB + ST
NextLB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, LB.get(), ST.get());
if (!NextLB.isUsable())
return 0;
// LB = LB + ST
NextLB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, LB.get(), NextLB.get());
NextLB =
SemaRef.ActOnFinishFullExpr(NextLB.get(), /*DiscardedValue*/ false);
if (!NextLB.isUsable())
return 0;
// UB + ST
NextUB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, UB.get(), ST.get());
if (!NextUB.isUsable())
return 0;
// UB = UB + ST
NextUB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, UB.get(), NextUB.get());
NextUB =
SemaRef.ActOnFinishFullExpr(NextUB.get(), /*DiscardedValue*/ false);
if (!NextUB.isUsable())
return 0;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
CombNextLB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, CombLB.get(), ST.get());
if (!CombNextLB.isUsable())
return 0;
// CombLB = CombLB + ST
CombNextLB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, CombLB.get(),
CombNextLB.get());
CombNextLB = SemaRef.ActOnFinishFullExpr(CombNextLB.get(),
/*DiscardedValue*/ false);
if (!CombNextLB.isUsable())
return 0;
// CombUB + ST
CombNextUB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, CombUB.get(), ST.get());
if (!CombNextUB.isUsable())
return 0;
// CombUB = CombUB + ST
CombNextUB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, CombUB.get(),
CombNextUB.get());
CombNextUB = SemaRef.ActOnFinishFullExpr(CombNextUB.get(),
/*DiscardedValue*/ false);
if (!CombNextUB.isUsable())
return 0;
}
}
// Create the increment expression for the distribute loop when combined in the
// same directive with 'for', as IV = IV + ST; ensure the upper bound expression
// is based on PrevUB instead of NumIterations - used to implement 'for' when
// found in combination with 'distribute', as in 'distribute parallel for'.
SourceLocation DistIncLoc = AStmt->getBeginLoc();
ExprResult DistCond, DistInc, PrevEUB, ParForInDistCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
DistCond = SemaRef.BuildBinOp(
CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE, IV.get(), BoundUB);
assert(DistCond.isUsable() && "distribute cond expr was not built");
DistInc =
SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Add, IV.get(), ST.get());
assert(DistInc.isUsable() && "distribute inc expr was not built");
DistInc = SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Assign, IV.get(),
DistInc.get());
DistInc =
SemaRef.ActOnFinishFullExpr(DistInc.get(), /*DiscardedValue*/ false);
assert(DistInc.isUsable() && "distribute inc expr was not built");
// Build expression: UB = min(UB, prevUB) for #for in composite or combined
// construct
SourceLocation DistEUBLoc = AStmt->getBeginLoc();
ExprResult IsUBGreater =
SemaRef.BuildBinOp(CurScope, DistEUBLoc, BO_GT, UB.get(), PrevUB.get());
ExprResult CondOp = SemaRef.ActOnConditionalOp(
DistEUBLoc, DistEUBLoc, IsUBGreater.get(), PrevUB.get(), UB.get());
PrevEUB = SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Assign, UB.get(),
CondOp.get());
PrevEUB =
SemaRef.ActOnFinishFullExpr(PrevEUB.get(), /*DiscardedValue*/ false);
// Build IV <= PrevUB or IV < PrevUB + 1 (for an unsigned IV) to be used when
// 'parallel for' is used in combination with a 'distribute' directive with
// schedule(static, 1).
Expr *BoundPrevUB = PrevUB.get();
if (UseStrictCompare) {
BoundPrevUB =
SemaRef
.BuildBinOp(
CurScope, CondLoc, BO_Add, BoundPrevUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
.get();
BoundPrevUB =
SemaRef.ActOnFinishFullExpr(BoundPrevUB, /*DiscardedValue*/ false)
.get();
}
ParForInDistCond =
SemaRef.BuildBinOp(CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE,
IV.get(), BoundPrevUB);
}
// Build updates and final values of the loop counters.
bool HasErrors = false;
Built.Counters.resize(NestedLoopCount);
Built.Inits.resize(NestedLoopCount);
Built.Updates.resize(NestedLoopCount);
Built.Finals.resize(NestedLoopCount);
Built.DependentCounters.resize(NestedLoopCount);
Built.DependentInits.resize(NestedLoopCount);
Built.FinalsConditions.resize(NestedLoopCount);
{
// We implement the following algorithm for obtaining the
// original loop iteration variable values based on the
// value of the collapsed loop iteration variable IV.
//
// Let n+1 be the number of collapsed loops in the nest.
// Iteration variables (I0, I1, .... In)
// Iteration counts (N0, N1, ... Nn)
//
// Acc = IV;
//
// To compute Ik for loop k, 0 <= k <= n, generate:
// Prod = N(k+1) * N(k+2) * ... * Nn;
// Ik = Acc / Prod;
// Acc -= Ik * Prod;
//
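// Illustrative (editorial) example of the algorithm above for two collapsed
// loops with N0 == 4, N1 == 3 and IV == 7:
//   Prod = N1 == 3; I0 = 7 / 3 == 2; Acc = 7 - 2 * 3 == 1;
//   Prod = 1;       I1 = Acc == 1;
// i.e. IV == 7 corresponds to the iteration (I0, I1) == (2, 1).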
ExprResult Acc = IV;
for (unsigned int Cnt = 0; Cnt < NestedLoopCount; ++Cnt) {
LoopIterationSpace &IS = IterSpaces[Cnt];
SourceLocation UpdLoc = IS.IncSrcRange.getBegin();
ExprResult Iter;
// Compute prod
ExprResult Prod =
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
for (unsigned int K = Cnt+1; K < NestedLoopCount; ++K)
Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul, Prod.get(),
IterSpaces[K].NumIterations);
// Iter = Acc / Prod
// Check if there is at least one more inner loop to avoid
// multiplication by 1.
if (Cnt + 1 < NestedLoopCount)
Iter = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Div,
Acc.get(), Prod.get());
else
Iter = Acc;
if (!Iter.isUsable()) {
HasErrors = true;
break;
}
// Update Acc:
// Acc -= Iter * Prod
// Check if there is at least one more inner loop to avoid
// multiplication by 1.
if (Cnt + 1 < NestedLoopCount)
Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul,
Iter.get(), Prod.get());
else
Prod = Iter;
Acc = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Sub,
Acc.get(), Prod.get());
// Build update: IS.CounterVar(Private) = IS.Start + Iter * IS.Step
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IS.CounterVar)->getDecl());
DeclRefExpr *CounterVar = buildDeclRefExpr(
SemaRef, VD, IS.CounterVar->getType(), IS.CounterVar->getExprLoc(),
/*RefersToCapture=*/true);
ExprResult Init =
buildCounterInit(SemaRef, CurScope, UpdLoc, CounterVar,
IS.CounterInit, IS.IsNonRectangularLB, Captures);
if (!Init.isUsable()) {
HasErrors = true;
break;
}
ExprResult Update = buildCounterUpdate(
SemaRef, CurScope, UpdLoc, CounterVar, IS.CounterInit, Iter,
IS.CounterStep, IS.Subtract, IS.IsNonRectangularLB, &Captures);
if (!Update.isUsable()) {
HasErrors = true;
break;
}
// Build final: IS.CounterVar = IS.Start + IS.NumIters * IS.Step
ExprResult Final =
buildCounterUpdate(SemaRef, CurScope, UpdLoc, CounterVar,
IS.CounterInit, IS.NumIterations, IS.CounterStep,
IS.Subtract, IS.IsNonRectangularLB, &Captures);
if (!Final.isUsable()) {
HasErrors = true;
break;
}
if (!Update.isUsable() || !Final.isUsable()) {
HasErrors = true;
break;
}
// Save results
Built.Counters[Cnt] = IS.CounterVar;
Built.PrivateCounters[Cnt] = IS.PrivateCounterVar;
Built.Inits[Cnt] = Init.get();
Built.Updates[Cnt] = Update.get();
Built.Finals[Cnt] = Final.get();
Built.DependentCounters[Cnt] = nullptr;
Built.DependentInits[Cnt] = nullptr;
Built.FinalsConditions[Cnt] = nullptr;
if (IS.IsNonRectangularLB || IS.IsNonRectangularUB) {
Built.DependentCounters[Cnt] =
Built.Counters[NestedLoopCount - 1 - IS.LoopDependentIdx];
Built.DependentInits[Cnt] =
Built.Inits[NestedLoopCount - 1 - IS.LoopDependentIdx];
Built.FinalsConditions[Cnt] = IS.FinalCondition;
}
}
}
if (HasErrors)
return 0;
// Save results
Built.IterationVarRef = IV.get();
Built.LastIteration = LastIteration.get();
Built.NumIterations = NumIterations.get();
Built.CalcLastIteration = SemaRef
.ActOnFinishFullExpr(CalcLastIteration.get(),
/*DiscardedValue=*/false)
.get();
Built.PreCond = PreCond.get();
Built.PreInits = buildPreInits(C, Captures);
Built.Cond = Cond.get();
Built.Init = Init.get();
Built.Inc = Inc.get();
Built.LB = LB.get();
Built.UB = UB.get();
Built.IL = IL.get();
Built.ST = ST.get();
Built.EUB = EUB.get();
Built.NLB = NextLB.get();
Built.NUB = NextUB.get();
Built.PrevLB = PrevLB.get();
Built.PrevUB = PrevUB.get();
Built.DistInc = DistInc.get();
Built.PrevEUB = PrevEUB.get();
Built.DistCombinedFields.LB = CombLB.get();
Built.DistCombinedFields.UB = CombUB.get();
Built.DistCombinedFields.EUB = CombEUB.get();
Built.DistCombinedFields.Init = CombInit.get();
Built.DistCombinedFields.Cond = CombCond.get();
Built.DistCombinedFields.NLB = CombNextLB.get();
Built.DistCombinedFields.NUB = CombNextUB.get();
Built.DistCombinedFields.DistCond = CombDistCond.get();
Built.DistCombinedFields.ParForInDistCond = ParForInDistCond.get();
return NestedLoopCount;
}
static Expr *getCollapseNumberExpr(ArrayRef<OMPClause *> Clauses) {
auto CollapseClauses =
OMPExecutableDirective::getClausesOfKind<OMPCollapseClause>(Clauses);
if (CollapseClauses.begin() != CollapseClauses.end())
return (*CollapseClauses.begin())->getNumForLoops();
return nullptr;
}
static Expr *getOrderedNumberExpr(ArrayRef<OMPClause *> Clauses) {
auto OrderedClauses =
OMPExecutableDirective::getClausesOfKind<OMPOrderedClause>(Clauses);
if (OrderedClauses.begin() != OrderedClauses.end())
return (*OrderedClauses.begin())->getNumForLoops();
return nullptr;
}
static bool checkSimdlenSafelenSpecified(Sema &S,
const ArrayRef<OMPClause *> Clauses) {
const OMPSafelenClause *Safelen = nullptr;
const OMPSimdlenClause *Simdlen = nullptr;
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_safelen)
Safelen = cast<OMPSafelenClause>(Clause);
else if (Clause->getClauseKind() == OMPC_simdlen)
Simdlen = cast<OMPSimdlenClause>(Clause);
if (Safelen && Simdlen)
break;
}
if (Simdlen && Safelen) {
const Expr *SimdlenLength = Simdlen->getSimdlen();
const Expr *SafelenLength = Safelen->getSafelen();
if (SimdlenLength->isValueDependent() || SimdlenLength->isTypeDependent() ||
SimdlenLength->isInstantiationDependent() ||
SimdlenLength->containsUnexpandedParameterPack())
return false;
if (SafelenLength->isValueDependent() || SafelenLength->isTypeDependent() ||
SafelenLength->isInstantiationDependent() ||
SafelenLength->containsUnexpandedParameterPack())
return false;
Expr::EvalResult SimdlenResult, SafelenResult;
SimdlenLength->EvaluateAsInt(SimdlenResult, S.Context);
SafelenLength->EvaluateAsInt(SafelenResult, S.Context);
llvm::APSInt SimdlenRes = SimdlenResult.Val.getInt();
llvm::APSInt SafelenRes = SafelenResult.Val.getInt();
// OpenMP 4.5 [2.8.1, simd Construct, Restrictions]
// If both simdlen and safelen clauses are specified, the value of the
// simdlen parameter must be less than or equal to the value of the safelen
// parameter.
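// Illustrative (editorial) example of a violating combination:
//   #pragma omp simd simdlen(8) safelen(4)
// which is rejected below with err_omp_wrong_simdlen_safelen_values.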
if (SimdlenRes > SafelenRes) {
S.Diag(SimdlenLength->getExprLoc(),
diag::err_omp_wrong_simdlen_safelen_values)
<< SimdlenLength->getSourceRange() << SafelenLength->getSourceRange();
return true;
}
}
return false;
}
StmtResult
Sema::ActOnOpenMPSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
SourceLocation StartLoc, SourceLocation EndLoc,
VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In the presence of a 'collapse' or 'ordered' clause with a number of loops,
// it will define the number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount,
Clauses, AStmt, B);
}
StmtResult
Sema::ActOnOpenMPForDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
SourceLocation StartLoc, SourceLocation EndLoc,
VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In the presence of a 'collapse' or 'ordered' clause with a number of loops,
// it will define the number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
setFunctionHasBranchProtectedScope();
return OMPForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In the presence of a 'collapse' or 'ordered' clause with a number of loops,
// it will define the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPForSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount,
Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto BaseStmt = AStmt;
while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
BaseStmt = CS->getCapturedStmt();
if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
auto S = C->children();
if (S.begin() == S.end())
return StmtError();
// All associated statements must be '#pragma omp section' except for
// the first one.
for (Stmt *SectionStmt : llvm::make_range(std::next(S.begin()), S.end())) {
if (!SectionStmt || !isa<OMPSectionDirective>(SectionStmt)) {
if (SectionStmt)
Diag(SectionStmt->getBeginLoc(),
diag::err_omp_sections_substmt_not_section);
return StmtError();
}
cast<OMPSectionDirective>(SectionStmt)
->setHasCancel(DSAStack->isCancelRegion());
}
} else {
Diag(AStmt->getBeginLoc(), diag::err_omp_sections_not_compound_stmt);
return StmtError();
}
setFunctionHasBranchProtectedScope();
return OMPSectionsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(),
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPSectionDirective(Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
setFunctionHasBranchProtectedScope();
DSAStack->setParentCancelRegion(DSAStack->isCancelRegion());
return OMPSectionDirective::Create(Context, StartLoc, EndLoc, AStmt,
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
setFunctionHasBranchProtectedScope();
// OpenMP [2.7.3, single Construct, Restrictions]
// The copyprivate clause must not be used with the nowait clause.
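// Illustrative (editorial) example of a violating combination:
//   #pragma omp single copyprivate(a) nowait
// which is rejected below with err_omp_single_copyprivate_with_nowait.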
const OMPClause *Nowait = nullptr;
const OMPClause *Copyprivate = nullptr;
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_nowait)
Nowait = Clause;
else if (Clause->getClauseKind() == OMPC_copyprivate)
Copyprivate = Clause;
if (Copyprivate && Nowait) {
Diag(Copyprivate->getBeginLoc(),
diag::err_omp_single_copyprivate_with_nowait);
Diag(Nowait->getBeginLoc(), diag::note_omp_nowait_clause_here);
return StmtError();
}
}
return OMPSingleDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
}
StmtResult Sema::ActOnOpenMPMasterDirective(Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
setFunctionHasBranchProtectedScope();
return OMPMasterDirective::Create(Context, StartLoc, EndLoc, AStmt);
}
StmtResult Sema::ActOnOpenMPCriticalDirective(
const DeclarationNameInfo &DirName, ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
bool ErrorFound = false;
llvm::APSInt Hint;
SourceLocation HintLoc;
bool DependentHint = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_hint) {
if (!DirName.getName()) {
Diag(C->getBeginLoc(), diag::err_omp_hint_clause_no_name);
ErrorFound = true;
}
Expr *E = cast<OMPHintClause>(C)->getHint();
if (E->isTypeDependent() || E->isValueDependent() ||
E->isInstantiationDependent()) {
DependentHint = true;
} else {
Hint = E->EvaluateKnownConstInt(Context);
HintLoc = C->getBeginLoc();
}
}
}
if (ErrorFound)
return StmtError();
const auto Pair = DSAStack->getCriticalWithHint(DirName);
if (Pair.first && DirName.getName() && !DependentHint) {
if (llvm::APSInt::compareValues(Hint, Pair.second) != 0) {
Diag(StartLoc, diag::err_omp_critical_with_hint);
if (HintLoc.isValid())
Diag(HintLoc, diag::note_omp_critical_hint_here)
<< 0 << Hint.toString(/*Radix=*/10, /*Signed=*/false);
else
Diag(StartLoc, diag::note_omp_critical_no_hint) << 0;
if (const auto *C = Pair.first->getSingleClause<OMPHintClause>()) {
Diag(C->getBeginLoc(), diag::note_omp_critical_hint_here)
<< 1
<< C->getHint()->EvaluateKnownConstInt(Context).toString(
/*Radix=*/10, /*Signed=*/false);
} else {
Diag(Pair.first->getBeginLoc(), diag::note_omp_critical_no_hint) << 1;
}
}
}
setFunctionHasBranchProtectedScope();
auto *Dir = OMPCriticalDirective::Create(Context, DirName, StartLoc, EndLoc,
Clauses, AStmt);
if (!Pair.first && DirName.getName() && !DependentHint)
DSAStack->addCriticalWithHint(Dir, Hint);
return Dir;
}
StmtResult Sema::ActOnOpenMPParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
OMPLoopDirective::HelperExprs B;
// In the presence of a 'collapse' or 'ordered' clause with a number of loops,
// it will define the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp parallel for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
setFunctionHasBranchProtectedScope();
return OMPParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
OMPLoopDirective::HelperExprs B;
// In the presence of a 'collapse' or 'ordered' clause with a number of loops,
// it will define the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult
Sema::ActOnOpenMPParallelMasterDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
setFunctionHasBranchProtectedScope();
return OMPParallelMasterDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef());
}
StmtResult
Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto BaseStmt = AStmt;
while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
BaseStmt = CS->getCapturedStmt();
if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
auto S = C->children();
if (S.begin() == S.end())
return StmtError();
// All associated statements must be '#pragma omp section' except for
// the first one.
for (Stmt *SectionStmt : llvm::make_range(std::next(S.begin()), S.end())) {
if (!SectionStmt || !isa<OMPSectionDirective>(SectionStmt)) {
if (SectionStmt)
Diag(SectionStmt->getBeginLoc(),
diag::err_omp_parallel_sections_substmt_not_section);
return StmtError();
}
cast<OMPSectionDirective>(SectionStmt)
->setHasCancel(DSAStack->isCancelRegion());
}
} else {
Diag(AStmt->getBeginLoc(),
diag::err_omp_parallel_sections_not_compound_stmt);
return StmtError();
}
setFunctionHasBranchProtectedScope();
return OMPParallelSectionsDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
/// The detach and mergeable clauses are mutually exclusive; check for it.
static bool checkDetachMergeableClauses(Sema &S,
ArrayRef<OMPClause *> Clauses) {
const OMPClause *PrevClause = nullptr;
bool ErrorFound = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_detach ||
C->getClauseKind() == OMPC_mergeable) {
if (!PrevClause) {
PrevClause = C;
} else if (PrevClause->getClauseKind() != C->getClauseKind()) {
S.Diag(C->getBeginLoc(), diag::err_omp_clauses_mutually_exclusive)
<< getOpenMPClauseName(C->getClauseKind())
<< getOpenMPClauseName(PrevClause->getClauseKind());
S.Diag(PrevClause->getBeginLoc(), diag::note_omp_previous_clause)
<< getOpenMPClauseName(PrevClause->getClauseKind());
ErrorFound = true;
}
}
}
return ErrorFound;
}
StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
// OpenMP 5.0, 2.10.1 task Construct
// If a detach clause appears on the directive, then a mergeable clause cannot
// appear on the same directive.
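// Illustrative (editorial) example of a violating combination, assuming an
// event handle 'evt' of type omp_event_handle_t:
//   #pragma omp task detach(evt) mergeable
// which checkDetachMergeableClauses rejects with
// err_omp_clauses_mutually_exclusive.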
if (checkDetachMergeableClauses(*this, Clauses))
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
setFunctionHasBranchProtectedScope();
return OMPTaskDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPTaskyieldDirective::Create(Context, StartLoc, EndLoc);
}
StmtResult Sema::ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPBarrierDirective::Create(Context, StartLoc, EndLoc);
}
StmtResult Sema::ActOnOpenMPTaskwaitDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPTaskwaitDirective::Create(Context, StartLoc, EndLoc);
}
StmtResult Sema::ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
setFunctionHasBranchProtectedScope();
return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt,
DSAStack->getTaskgroupReductionRef());
}
StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
OMPFlushClause *FC = nullptr;
OMPClause *OrderClause = nullptr;
for (OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_flush)
FC = cast<OMPFlushClause>(C);
else
OrderClause = C;
}
OpenMPClauseKind MemOrderKind = OMPC_unknown;
SourceLocation MemOrderLoc;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_acq_rel ||
C->getClauseKind() == OMPC_acquire ||
C->getClauseKind() == OMPC_release) {
if (MemOrderKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses)
<< getOpenMPDirectiveName(OMPD_flush) << 1
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
} else {
MemOrderKind = C->getClauseKind();
MemOrderLoc = C->getBeginLoc();
}
}
}
if (FC && OrderClause) {
Diag(FC->getLParenLoc(), diag::err_omp_flush_order_clause_and_list)
<< getOpenMPClauseName(OrderClause->getClauseKind());
Diag(OrderClause->getBeginLoc(), diag::note_omp_flush_order_clause_here)
<< getOpenMPClauseName(OrderClause->getClauseKind());
return StmtError();
}
return OMPFlushDirective::Create(Context, StartLoc, EndLoc, Clauses);
}
StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (Clauses.empty()) {
Diag(StartLoc, diag::err_omp_depobj_expected);
return StmtError();
} else if (Clauses[0]->getClauseKind() != OMPC_depobj) {
Diag(Clauses[0]->getBeginLoc(), diag::err_omp_depobj_expected);
return StmtError();
}
// Only the depobj expression and one other single clause are allowed.
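// Illustrative (editorial) example of a well-formed directive with exactly the
// expected two clauses:
//   #pragma omp depobj(o) depend(in: a)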
if (Clauses.size() > 2) {
Diag(Clauses[2]->getBeginLoc(),
diag::err_omp_depobj_single_clause_expected);
return StmtError();
} else if (Clauses.size() < 2) {
Diag(Clauses[0]->getEndLoc(), diag::err_omp_depobj_single_clause_expected);
return StmtError();
}
return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses);
}
StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// Check that exactly one clause is specified.
if (Clauses.size() != 1) {
Diag(Clauses.empty() ? EndLoc : Clauses[1]->getBeginLoc(),
diag::err_omp_scan_single_clause_expected);
return StmtError();
}
// Check that the scan directive is used in the scope of the OpenMP loop body.
if (Scope *S = DSAStack->getCurScope()) {
Scope *ParentS = S->getParent();
if (!ParentS || ParentS->getParent() != ParentS->getBreakParent() ||
!ParentS->getBreakParent()->isOpenMPLoopScope())
return StmtError(Diag(StartLoc, diag::err_omp_orphaned_device_directive)
<< getOpenMPDirectiveName(OMPD_scan) << 5);
}
// Check that only one scan directive is used in the same outer region.
if (DSAStack->doesParentHasScanDirective()) {
Diag(StartLoc, diag::err_omp_several_directives_in_region) << "scan";
Diag(DSAStack->getParentScanDirectiveLoc(),
diag::note_omp_previous_directive)
<< "scan";
return StmtError();
}
DSAStack->setParentHasScanDirective(StartLoc);
return OMPScanDirective::Create(Context, StartLoc, EndLoc, Clauses);
}
StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
const OMPClause *DependFound = nullptr;
const OMPClause *DependSourceClause = nullptr;
const OMPClause *DependSinkClause = nullptr;
bool ErrorFound = false;
const OMPThreadsClause *TC = nullptr;
const OMPSIMDClause *SC = nullptr;
for (const OMPClause *C : Clauses) {
if (auto *DC = dyn_cast<OMPDependClause>(C)) {
DependFound = C;
if (DC->getDependencyKind() == OMPC_DEPEND_source) {
if (DependSourceClause) {
Diag(C->getBeginLoc(), diag::err_omp_more_one_clause)
<< getOpenMPDirectiveName(OMPD_ordered)
<< getOpenMPClauseName(OMPC_depend) << 2;
ErrorFound = true;
} else {
DependSourceClause = C;
}
if (DependSinkClause) {
Diag(C->getBeginLoc(), diag::err_omp_depend_sink_source_not_allowed)
<< 0;
ErrorFound = true;
}
} else if (DC->getDependencyKind() == OMPC_DEPEND_sink) {
if (DependSourceClause) {
Diag(C->getBeginLoc(), diag::err_omp_depend_sink_source_not_allowed)
<< 1;
ErrorFound = true;
}
DependSinkClause = C;
}
} else if (C->getClauseKind() == OMPC_threads) {
TC = cast<OMPThreadsClause>(C);
} else if (C->getClauseKind() == OMPC_simd) {
SC = cast<OMPSIMDClause>(C);
}
}
if (!ErrorFound && !SC &&
isOpenMPSimdDirective(DSAStack->getParentDirective())) {
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP construct
// that can appear in the simd region.
Diag(StartLoc, diag::err_omp_prohibited_region_simd)
<< (LangOpts.OpenMP >= 50 ? 1 : 0);
ErrorFound = true;
} else if (DependFound && (TC || SC)) {
Diag(DependFound->getBeginLoc(), diag::err_omp_depend_clause_thread_simd)
<< getOpenMPClauseName(TC ? TC->getClauseKind() : SC->getClauseKind());
ErrorFound = true;
} else if (DependFound && !DSAStack->getParentOrderedRegionParam().first) {
Diag(DependFound->getBeginLoc(),
diag::err_omp_ordered_directive_without_param);
ErrorFound = true;
} else if (TC || Clauses.empty()) {
if (const Expr *Param = DSAStack->getParentOrderedRegionParam().first) {
SourceLocation ErrLoc = TC ? TC->getBeginLoc() : StartLoc;
Diag(ErrLoc, diag::err_omp_ordered_directive_with_param)
<< (TC != nullptr);
Diag(Param->getBeginLoc(), diag::note_omp_ordered_param) << 1;
ErrorFound = true;
}
}
if ((!AStmt && !DependFound) || ErrorFound)
return StmtError();
// OpenMP 5.0, 2.17.9, ordered Construct, Restrictions.
// During execution of an iteration of a worksharing-loop or a loop nest
// within a worksharing-loop, simd, or worksharing-loop SIMD region, a thread
// must not execute more than one ordered region corresponding to an ordered
// construct without a depend clause.
if (!DependFound) {
if (DSAStack->doesParentHasOrderedDirective()) {
Diag(StartLoc, diag::err_omp_several_directives_in_region) << "ordered";
Diag(DSAStack->getParentOrderedDirectiveLoc(),
diag::note_omp_previous_directive)
<< "ordered";
return StmtError();
}
DSAStack->setParentHasOrderedDirective(StartLoc);
}
if (AStmt) {
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
setFunctionHasBranchProtectedScope();
}
return OMPOrderedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
}
namespace {
/// Helper class for checking expression in 'omp atomic [update]'
/// construct.
class OpenMPAtomicUpdateChecker {
/// Error results for atomic update expressions.
enum ExprAnalysisErrorCode {
/// A statement is not an expression statement.
NotAnExpression,
/// Expression is not builtin binary or unary operation.
NotABinaryOrUnaryExpression,
/// Unary operation is not post-/pre- increment/decrement operation.
NotAnUnaryIncDecExpression,
/// An expression is not of scalar type.
NotAScalarType,
/// A binary operation is not an assignment operation.
NotAnAssignmentOp,
/// RHS part of the binary operation is not a binary expression.
NotABinaryExpression,
/// RHS part is not an additive/multiplicative/shift/bitwise binary
/// expression.
NotABinaryOperator,
/// RHS binary operation does not have reference to the updated LHS
/// part.
NotAnUpdateExpression,
/// No error is found.
NoError
};
/// Reference to Sema.
Sema &SemaRef;
/// A location for note diagnostics (when error is found).
SourceLocation NoteLoc;
/// 'x' lvalue part of the source atomic expression.
Expr *X;
/// 'expr' rvalue part of the source atomic expression.
Expr *E;
/// Helper expression of the form
/// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
Expr *UpdateExpr;
/// True if 'x' is the LHS in the RHS part of the full update expression. It
/// is important for non-associative operations.
bool IsXLHSInRHSPart;
BinaryOperatorKind Op;
SourceLocation OpLoc;
/// true if the source expression is a postfix unary operation, false
/// if it is a prefix unary operation.
bool IsPostfixUpdate;
public:
OpenMPAtomicUpdateChecker(Sema &SemaRef)
: SemaRef(SemaRef), X(nullptr), E(nullptr), UpdateExpr(nullptr),
IsXLHSInRHSPart(false), Op(BO_PtrMemD), IsPostfixUpdate(false) {}
/// Check that the specified statement is suitable for 'atomic update'
/// constructs and extract 'x', 'expr' and the operation from the original
/// expression. If DiagId and NoteId == 0, then only the check is performed,
/// without error notification.
/// \param DiagId Diagnostic which should be emitted if an error is found.
/// \param NoteId Diagnostic note for the main error message.
/// \return true if the statement is not an update expression, false otherwise.
bool checkStatement(Stmt *S, unsigned DiagId = 0, unsigned NoteId = 0);
/// Return the 'x' lvalue part of the source atomic expression.
Expr *getX() const { return X; }
/// Return the 'expr' rvalue part of the source atomic expression.
Expr *getExpr() const { return E; }
/// Return the update expression used in calculation of the updated
/// value. Always has form 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
Expr *getUpdateExpr() const { return UpdateExpr; }
/// Return true if 'x' is LHS in RHS part of full update expression,
/// false otherwise.
bool isXLHSInRHSPart() const { return IsXLHSInRHSPart; }
/// true if the source expression is a postfix unary operation, false
/// if it is a prefix unary operation.
bool isPostfixUpdate() const { return IsPostfixUpdate; }
private:
bool checkBinaryOperation(BinaryOperator *AtomicBinOp, unsigned DiagId = 0,
unsigned NoteId = 0);
};
} // namespace
bool OpenMPAtomicUpdateChecker::checkBinaryOperation(
BinaryOperator *AtomicBinOp, unsigned DiagId, unsigned NoteId) {
ExprAnalysisErrorCode ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// Allowed constructs are:
// x = x binop expr;
// x = expr binop x;
if (AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getLHS();
if (const auto *AtomicInnerBinOp = dyn_cast<BinaryOperator>(
AtomicBinOp->getRHS()->IgnoreParenImpCasts())) {
if (AtomicInnerBinOp->isMultiplicativeOp() ||
AtomicInnerBinOp->isAdditiveOp() || AtomicInnerBinOp->isShiftOp() ||
AtomicInnerBinOp->isBitwiseOp()) {
Op = AtomicInnerBinOp->getOpcode();
OpLoc = AtomicInnerBinOp->getOperatorLoc();
Expr *LHS = AtomicInnerBinOp->getLHS();
Expr *RHS = AtomicInnerBinOp->getRHS();
llvm::FoldingSetNodeID XId, LHSId, RHSId;
X->IgnoreParenImpCasts()->Profile(XId, SemaRef.getASTContext(),
/*Canonical=*/true);
LHS->IgnoreParenImpCasts()->Profile(LHSId, SemaRef.getASTContext(),
/*Canonical=*/true);
RHS->IgnoreParenImpCasts()->Profile(RHSId, SemaRef.getASTContext(),
/*Canonical=*/true);
if (XId == LHSId) {
E = RHS;
IsXLHSInRHSPart = true;
} else if (XId == RHSId) {
E = LHS;
IsXLHSInRHSPart = false;
} else {
ErrorLoc = AtomicInnerBinOp->getExprLoc();
ErrorRange = AtomicInnerBinOp->getSourceRange();
NoteLoc = X->getExprLoc();
NoteRange = X->getSourceRange();
ErrorFound = NotAnUpdateExpression;
}
} else {
ErrorLoc = AtomicInnerBinOp->getExprLoc();
ErrorRange = AtomicInnerBinOp->getSourceRange();
NoteLoc = AtomicInnerBinOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
ErrorFound = NotABinaryOperator;
}
} else {
NoteLoc = ErrorLoc = AtomicBinOp->getRHS()->getExprLoc();
NoteRange = ErrorRange = AtomicBinOp->getRHS()->getSourceRange();
ErrorFound = NotABinaryExpression;
}
} else {
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = AtomicBinOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
ErrorFound = NotAnAssignmentOp;
}
if (ErrorFound != NoError && DiagId != 0 && NoteId != 0) {
SemaRef.Diag(ErrorLoc, DiagId) << ErrorRange;
SemaRef.Diag(NoteLoc, NoteId) << ErrorFound << NoteRange;
return true;
}
if (SemaRef.CurContext->isDependentContext())
E = X = UpdateExpr = nullptr;
return ErrorFound != NoError;
}
bool OpenMPAtomicUpdateChecker::checkStatement(Stmt *S, unsigned DiagId,
unsigned NoteId) {
ExprAnalysisErrorCode ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// Allowed constructs are:
// x++;
// x--;
// ++x;
// --x;
// x binop= expr;
// x = x binop expr;
// x = expr binop x;
if (auto *AtomicBody = dyn_cast<Expr>(S)) {
AtomicBody = AtomicBody->IgnoreParenImpCasts();
if (AtomicBody->getType()->isScalarType() ||
AtomicBody->isInstantiationDependent()) {
if (const auto *AtomicCompAssignOp = dyn_cast<CompoundAssignOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Compound Assignment Operation
Op = BinaryOperator::getOpForCompoundAssignment(
AtomicCompAssignOp->getOpcode());
OpLoc = AtomicCompAssignOp->getOperatorLoc();
E = AtomicCompAssignOp->getRHS();
X = AtomicCompAssignOp->getLHS()->IgnoreParens();
IsXLHSInRHSPart = true;
} else if (auto *AtomicBinOp = dyn_cast<BinaryOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Binary Operation
if (checkBinaryOperation(AtomicBinOp, DiagId, NoteId))
return true;
} else if (const auto *AtomicUnaryOp = dyn_cast<UnaryOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Unary Operation
if (AtomicUnaryOp->isIncrementDecrementOp()) {
IsPostfixUpdate = AtomicUnaryOp->isPostfix();
Op = AtomicUnaryOp->isIncrementOp() ? BO_Add : BO_Sub;
OpLoc = AtomicUnaryOp->getOperatorLoc();
X = AtomicUnaryOp->getSubExpr()->IgnoreParens();
E = SemaRef.ActOnIntegerConstant(OpLoc, /*uint64_t Val=*/1).get();
IsXLHSInRHSPart = true;
} else {
ErrorFound = NotAnUnaryIncDecExpression;
ErrorLoc = AtomicUnaryOp->getExprLoc();
ErrorRange = AtomicUnaryOp->getSourceRange();
NoteLoc = AtomicUnaryOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
}
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotABinaryOrUnaryExpression;
NoteLoc = ErrorLoc = AtomicBody->getExprLoc();
NoteRange = ErrorRange = AtomicBody->getSourceRange();
}
} else {
ErrorFound = NotAScalarType;
NoteLoc = ErrorLoc = AtomicBody->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
}
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = S->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
}
if (ErrorFound != NoError && DiagId != 0 && NoteId != 0) {
SemaRef.Diag(ErrorLoc, DiagId) << ErrorRange;
SemaRef.Diag(NoteLoc, NoteId) << ErrorFound << NoteRange;
return true;
}
if (SemaRef.CurContext->isDependentContext())
E = X = UpdateExpr = nullptr;
if (ErrorFound == NoError && E && X) {
// Build an update expression of form 'OpaqueValueExpr(x) binop
// OpaqueValueExpr(expr)' or 'OpaqueValueExpr(expr) binop
// OpaqueValueExpr(x)' and then cast it to the type of the 'x' expression.
auto *OVEX = new (SemaRef.getASTContext())
OpaqueValueExpr(X->getExprLoc(), X->getType(), VK_RValue);
auto *OVEExpr = new (SemaRef.getASTContext())
OpaqueValueExpr(E->getExprLoc(), E->getType(), VK_RValue);
ExprResult Update =
SemaRef.CreateBuiltinBinOp(OpLoc, Op, IsXLHSInRHSPart ? OVEX : OVEExpr,
IsXLHSInRHSPart ? OVEExpr : OVEX);
if (Update.isInvalid())
return true;
Update = SemaRef.PerformImplicitConversion(Update.get(), X->getType(),
Sema::AA_Casting);
if (Update.isInvalid())
return true;
UpdateExpr = Update.get();
}
return ErrorFound != NoError;
}
StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// Register location of the first atomic directive.
DSAStack->addAtomicDirectiveLoc(StartLoc);
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OpenMPClauseKind AtomicKind = OMPC_unknown;
SourceLocation AtomicKindLoc;
OpenMPClauseKind MemOrderKind = OMPC_unknown;
SourceLocation MemOrderLoc;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_read || C->getClauseKind() == OMPC_write ||
C->getClauseKind() == OMPC_update ||
C->getClauseKind() == OMPC_capture) {
if (AtomicKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_atomic_several_clauses)
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(AtomicKindLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(AtomicKind);
} else {
AtomicKind = C->getClauseKind();
AtomicKindLoc = C->getBeginLoc();
}
}
if (C->getClauseKind() == OMPC_seq_cst ||
C->getClauseKind() == OMPC_acq_rel ||
C->getClauseKind() == OMPC_acquire ||
C->getClauseKind() == OMPC_release ||
C->getClauseKind() == OMPC_relaxed) {
if (MemOrderKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses)
<< getOpenMPDirectiveName(OMPD_atomic) << 0
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
} else {
MemOrderKind = C->getClauseKind();
MemOrderLoc = C->getBeginLoc();
}
}
}
// OpenMP 5.0, 2.17.7 atomic Construct, Restrictions
// If atomic-clause is read then memory-order-clause must not be acq_rel or
// release.
// If atomic-clause is write then memory-order-clause must not be acq_rel or
// acquire.
// If atomic-clause is update or not present then memory-order-clause must not
// be acq_rel or acquire.
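// Illustrative (editorial) examples of rejected combinations:
//   #pragma omp atomic read release
//   #pragma omp atomic write acquire
// both of which are diagnosed below with
// err_omp_atomic_incompatible_mem_order_clause.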
if ((AtomicKind == OMPC_read &&
(MemOrderKind == OMPC_acq_rel || MemOrderKind == OMPC_release)) ||
((AtomicKind == OMPC_write || AtomicKind == OMPC_update ||
AtomicKind == OMPC_unknown) &&
(MemOrderKind == OMPC_acq_rel || MemOrderKind == OMPC_acquire))) {
SourceLocation Loc = AtomicKindLoc;
if (AtomicKind == OMPC_unknown)
Loc = StartLoc;
Diag(Loc, diag::err_omp_atomic_incompatible_mem_order_clause)
<< getOpenMPClauseName(AtomicKind)
<< (AtomicKind == OMPC_unknown ? 1 : 0)
<< getOpenMPClauseName(MemOrderKind);
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
}
Stmt *Body = CS->getCapturedStmt();
if (auto *EWC = dyn_cast<ExprWithCleanups>(Body))
Body = EWC->getSubExpr();
Expr *X = nullptr;
Expr *V = nullptr;
Expr *E = nullptr;
Expr *UE = nullptr;
bool IsXLHSInRHSPart = false;
bool IsPostfixUpdate = false;
// OpenMP [2.12.6, atomic Construct]
// In the next expressions:
// * x and v (as applicable) are both l-value expressions with scalar type.
// * During the execution of an atomic region, multiple syntactic
// occurrences of x must designate the same storage location.
// * Neither of v and expr (as applicable) may access the storage location
// designated by x.
// * Neither of x and expr (as applicable) may access the storage location
// designated by v.
// * expr is an expression with scalar type.
// * binop is one of +, *, -, /, &, ^, |, <<, or >>.
// * binop, binop=, ++, and -- are not overloaded operators.
// * The expression x binop expr must be numerically equivalent to x binop
// (expr). This requirement is satisfied if the operators in expr have
// precedence greater than binop, or by using parentheses around expr or
// subexpressions of expr.
// * The expression expr binop x must be numerically equivalent to (expr)
// binop x. This requirement is satisfied if the operators in expr have
// precedence equal to or greater than binop, or by using parentheses around
// expr or subexpressions of expr.
// * For forms that allow multiple occurrences of x, the number of times
// that x is evaluated is unspecified.
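// Illustrative (editorial) example of the precedence requirement above:
// 'x = x + y * z' is a valid update form because '*' binds tighter than the
// '+' binop, so expr is 'y * z'; 'x = x + y - z' parses as 'x = (x + y) - z'
// and needs parentheses, i.e. 'x = x + (y - z)', to match one of the forms.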
if (AtomicKind == OMPC_read) {
enum {
NotAnExpression,
NotAnAssignmentOp,
NotAScalarType,
NotAnLValue,
NoError
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// If clause is read:
// v = x;
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
const auto *AtomicBinOp =
dyn_cast<BinaryOperator>(AtomicBody->IgnoreParenImpCasts());
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getRHS()->IgnoreParenImpCasts();
V = AtomicBinOp->getLHS()->IgnoreParenImpCasts();
if ((X->isInstantiationDependent() || X->getType()->isScalarType()) &&
(V->isInstantiationDependent() || V->getType()->isScalarType())) {
if (!X->isLValue() || !V->isLValue()) {
const Expr *NotLValueExpr = X->isLValue() ? V : X;
ErrorFound = NotAnLValue;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotLValueExpr->getExprLoc();
NoteRange = NotLValueExpr->getSourceRange();
}
} else if (!X->isInstantiationDependent() ||
!V->isInstantiationDependent()) {
const Expr *NotScalarExpr =
(X->isInstantiationDependent() || X->getType()->isScalarType())
? V
: X;
ErrorFound = NotAScalarType;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotScalarExpr->getExprLoc();
NoteRange = NotScalarExpr->getSourceRange();
}
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotAnAssignmentOp;
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
}
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
}
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_read_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_read_write) << ErrorFound
<< NoteRange;
return StmtError();
}
if (CurContext->isDependentContext())
V = X = nullptr;
} else if (AtomicKind == OMPC_write) {
enum {
NotAnExpression,
NotAnAssignmentOp,
NotAScalarType,
NotAnLValue,
NoError
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// If clause is write:
// x = expr;
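// Illustrative sketch (not from the original source; hypothetical user code
// with scalars x and y):
//
//   #pragma omp atomic write
//   x = 2 * y;   // accepted: x is a scalar l-value, 2 * y is a scalar expr
//   #pragma omp atomic write
//   x += y;      // rejected below: compound assignment is not the 'x = expr' form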
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
const auto *AtomicBinOp =
dyn_cast<BinaryOperator>(AtomicBody->IgnoreParenImpCasts());
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getLHS();
E = AtomicBinOp->getRHS();
if ((X->isInstantiationDependent() || X->getType()->isScalarType()) &&
(E->isInstantiationDependent() || E->getType()->isScalarType())) {
if (!X->isLValue()) {
ErrorFound = NotAnLValue;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = X->getExprLoc();
NoteRange = X->getSourceRange();
}
} else if (!X->isInstantiationDependent() ||
!E->isInstantiationDependent()) {
const Expr *NotScalarExpr =
(X->isInstantiationDependent() || X->getType()->isScalarType())
? E
: X;
ErrorFound = NotAScalarType;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotScalarExpr->getExprLoc();
NoteRange = NotScalarExpr->getSourceRange();
}
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotAnAssignmentOp;
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
}
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
}
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_write_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_read_write) << ErrorFound
<< NoteRange;
return StmtError();
}
if (CurContext->isDependentContext())
E = X = nullptr;
} else if (AtomicKind == OMPC_update || AtomicKind == OMPC_unknown) {
// If clause is update:
// x++;
// x--;
// ++x;
// --x;
// x binop= expr;
// x = x binop expr;
// x = expr binop x;
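// Illustrative sketch (not from the original source; hypothetical user code
// with scalars x, y, z):
//
//   #pragma omp atomic update
//   x += y;      // accepted: matches 'x binop= expr'
//   #pragma omp atomic update
//   x = y + z;   // rejected by the checker below: neither operand of the RHS is x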
OpenMPAtomicUpdateChecker Checker(*this);
if (Checker.checkStatement(
Body, (AtomicKind == OMPC_update)
? diag::err_omp_atomic_update_not_expression_statement
: diag::err_omp_atomic_not_expression_statement,
diag::note_omp_atomic_update))
return StmtError();
if (!CurContext->isDependentContext()) {
E = Checker.getExpr();
X = Checker.getX();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
}
} else if (AtomicKind == OMPC_capture) {
enum {
NotAnAssignmentOp,
NotACompoundStatement,
NotTwoSubstatements,
NotASpecificExpression,
NoError
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
// If clause is a capture:
// v = x++;
// v = x--;
// v = ++x;
// v = --x;
// v = x binop= expr;
// v = x = x binop expr;
// v = x = expr binop x;
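// Illustrative sketch (not from the original source; hypothetical user code
// with scalars v, x, y):
//
//   #pragma omp atomic capture
//   v = x++;     // accepted: postfix update, v receives the old value of x
//   #pragma omp atomic capture
//   v = y;       // rejected below: the right-hand side does not update x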
const auto *AtomicBinOp =
dyn_cast<BinaryOperator>(AtomicBody->IgnoreParenImpCasts());
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
V = AtomicBinOp->getLHS();
Body = AtomicBinOp->getRHS()->IgnoreParenImpCasts();
OpenMPAtomicUpdateChecker Checker(*this);
if (Checker.checkStatement(
Body, diag::err_omp_atomic_capture_not_expression_statement,
diag::note_omp_atomic_update))
return StmtError();
E = Checker.getExpr();
X = Checker.getX();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
IsPostfixUpdate = Checker.isPostfixUpdate();
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
ErrorFound = NotAnAssignmentOp;
}
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_capture_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange;
return StmtError();
}
if (CurContext->isDependentContext())
UE = V = E = X = nullptr;
} else {
// If clause is a capture:
// { v = x; x = expr; }
// { v = x; x++; }
// { v = x; x--; }
// { v = x; ++x; }
// { v = x; --x; }
// { v = x; x binop= expr; }
// { v = x; x = x binop expr; }
// { v = x; x = expr binop x; }
// { x++; v = x; }
// { x--; v = x; }
// { ++x; v = x; }
// { --x; v = x; }
// { x binop= expr; v = x; }
// { x = x binop expr; v = x; }
// { x = expr binop x; v = x; }
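// Illustrative sketch (not from the original source; hypothetical user code
// with scalars v, x, y, d, e):
//
//   #pragma omp atomic capture
//   { v = x; x += y; }   // accepted: capture of the old value, then an update of x
//   #pragma omp atomic capture
//   { v = x; d = e; }    // rejected below: the two statements do not refer to the same x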
if (auto *CS = dyn_cast<CompoundStmt>(Body)) {
// Check that this is { expr1; expr2; }
if (CS->size() == 2) {
Stmt *First = CS->body_front();
Stmt *Second = CS->body_back();
if (auto *EWC = dyn_cast<ExprWithCleanups>(First))
First = EWC->getSubExpr()->IgnoreParenImpCasts();
if (auto *EWC = dyn_cast<ExprWithCleanups>(Second))
Second = EWC->getSubExpr()->IgnoreParenImpCasts();
// Need to find what subexpression is 'v' and what is 'x'.
OpenMPAtomicUpdateChecker Checker(*this);
bool IsUpdateExprFound = !Checker.checkStatement(Second);
BinaryOperator *BinOp = nullptr;
if (IsUpdateExprFound) {
BinOp = dyn_cast<BinaryOperator>(First);
IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign;
}
if (IsUpdateExprFound && !CurContext->isDependentContext()) {
// { v = x; x++; }
// { v = x; x--; }
// { v = x; ++x; }
// { v = x; --x; }
// { v = x; x binop= expr; }
// { v = x; x = x binop expr; }
// { v = x; x = expr binop x; }
// Check that the first expression has form v = x.
Expr *PossibleX = BinOp->getRHS()->IgnoreParenImpCasts();
llvm::FoldingSetNodeID XId, PossibleXId;
Checker.getX()->Profile(XId, Context, /*Canonical=*/true);
PossibleX->Profile(PossibleXId, Context, /*Canonical=*/true);
IsUpdateExprFound = XId == PossibleXId;
if (IsUpdateExprFound) {
V = BinOp->getLHS();
X = Checker.getX();
E = Checker.getExpr();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
IsPostfixUpdate = true;
}
}
if (!IsUpdateExprFound) {
IsUpdateExprFound = !Checker.checkStatement(First);
BinOp = nullptr;
if (IsUpdateExprFound) {
BinOp = dyn_cast<BinaryOperator>(Second);
IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign;
}
if (IsUpdateExprFound && !CurContext->isDependentContext()) {
// { x++; v = x; }
// { x--; v = x; }
// { ++x; v = x; }
// { --x; v = x; }
// { x binop= expr; v = x; }
// { x = x binop expr; v = x; }
// { x = expr binop x; v = x; }
// Check that the second expression has form v = x.
Expr *PossibleX = BinOp->getRHS()->IgnoreParenImpCasts();
llvm::FoldingSetNodeID XId, PossibleXId;
Checker.getX()->Profile(XId, Context, /*Canonical=*/true);
PossibleX->Profile(PossibleXId, Context, /*Canonical=*/true);
IsUpdateExprFound = XId == PossibleXId;
if (IsUpdateExprFound) {
V = BinOp->getLHS();
X = Checker.getX();
E = Checker.getExpr();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
IsPostfixUpdate = false;
}
}
}
if (!IsUpdateExprFound) {
// { v = x; x = expr; }
auto *FirstExpr = dyn_cast<Expr>(First);
auto *SecondExpr = dyn_cast<Expr>(Second);
if (!FirstExpr || !SecondExpr ||
!(FirstExpr->isInstantiationDependent() ||
SecondExpr->isInstantiationDependent())) {
auto *FirstBinOp = dyn_cast<BinaryOperator>(First);
if (!FirstBinOp || FirstBinOp->getOpcode() != BO_Assign) {
ErrorFound = NotAnAssignmentOp;
NoteLoc = ErrorLoc = FirstBinOp ? FirstBinOp->getOperatorLoc()
: First->getBeginLoc();
NoteRange = ErrorRange = FirstBinOp
? FirstBinOp->getSourceRange()
: SourceRange(ErrorLoc, ErrorLoc);
} else {
auto *SecondBinOp = dyn_cast<BinaryOperator>(Second);
if (!SecondBinOp || SecondBinOp->getOpcode() != BO_Assign) {
ErrorFound = NotAnAssignmentOp;
NoteLoc = ErrorLoc = SecondBinOp
? SecondBinOp->getOperatorLoc()
: Second->getBeginLoc();
NoteRange = ErrorRange =
SecondBinOp ? SecondBinOp->getSourceRange()
: SourceRange(ErrorLoc, ErrorLoc);
} else {
Expr *PossibleXRHSInFirst =
FirstBinOp->getRHS()->IgnoreParenImpCasts();
Expr *PossibleXLHSInSecond =
SecondBinOp->getLHS()->IgnoreParenImpCasts();
llvm::FoldingSetNodeID X1Id, X2Id;
PossibleXRHSInFirst->Profile(X1Id, Context,
/*Canonical=*/true);
PossibleXLHSInSecond->Profile(X2Id, Context,
/*Canonical=*/true);
IsUpdateExprFound = X1Id == X2Id;
if (IsUpdateExprFound) {
V = FirstBinOp->getLHS();
X = SecondBinOp->getLHS();
E = SecondBinOp->getRHS();
UE = nullptr;
IsXLHSInRHSPart = false;
IsPostfixUpdate = true;
} else {
ErrorFound = NotASpecificExpression;
ErrorLoc = FirstBinOp->getExprLoc();
ErrorRange = FirstBinOp->getSourceRange();
NoteLoc = SecondBinOp->getLHS()->getExprLoc();
NoteRange = SecondBinOp->getRHS()->getSourceRange();
}
}
}
}
}
} else {
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange =
SourceRange(Body->getBeginLoc(), Body->getBeginLoc());
ErrorFound = NotTwoSubstatements;
}
} else {
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange =
SourceRange(Body->getBeginLoc(), Body->getBeginLoc());
ErrorFound = NotACompoundStatement;
}
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_capture_not_compound_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange;
return StmtError();
}
if (CurContext->isDependentContext())
UE = V = E = X = nullptr;
}
}
setFunctionHasBranchProtectedScope();
return OMPAtomicDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
X, V, E, UE, IsXLHSInRHSPart,
IsPostfixUpdate);
}
StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
// OpenMP [2.16, Nesting of Regions]
// If specified, a teams construct must be contained within a target
// construct. That target construct must contain no statements or directives
// outside of the teams construct.
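// Illustrative sketch (not from the original source; hypothetical user code,
// setup()/work() are placeholders):
//
//   #pragma omp target
//   #pragma omp teams          // accepted: teams is the only statement in target
//   { work(); }
//
//   #pragma omp target
//   {
//     setup();                 // rejected below: statement outside the teams construct
//     #pragma omp teams
//     { work(); }
//   }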
if (DSAStack->hasInnerTeamsRegion()) {
const Stmt *S = CS->IgnoreContainers(/*IgnoreCaptured=*/true);
bool OMPTeamsFound = true;
if (const auto *CS = dyn_cast<CompoundStmt>(S)) {
auto I = CS->body_begin();
while (I != CS->body_end()) {
const auto *OED = dyn_cast<OMPExecutableDirective>(*I);
if (!OED || !isOpenMPTeamsDirective(OED->getDirectiveKind()) ||
OMPTeamsFound) {
OMPTeamsFound = false;
break;
}
++I;
}
assert(I != CS->body_end() && "Statement not found");

S = *I;
} else {
const auto *OED = dyn_cast<OMPExecutableDirective>(S);
OMPTeamsFound = OED && isOpenMPTeamsDirective(OED->getDirectiveKind());
}
if (!OMPTeamsFound) {
Diag(StartLoc, diag::err_omp_target_contains_not_only_teams);
Diag(DSAStack->getInnerTeamsRegionLoc(),
diag::note_omp_nested_teams_construct_here);
Diag(S->getBeginLoc(), diag::note_omp_nested_statement_here)
<< isa<OMPExecutableDirective>(S);
return StmtError();
}
}
setFunctionHasBranchProtectedScope();
return OMPTargetDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
}
StmtResult
Sema::ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
setFunctionHasBranchProtectedScope();
return OMPTargetParallelDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPTargetParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
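// Illustrative sketch (not from the original source; hypothetical user code,
// n/m/body are placeholders): with
//   #pragma omp target parallel for collapse(2)
//   for (int i = 0; i < n; ++i)
//     for (int j = 0; j < m; ++j)
//       body(i, j);
// the two loops form a single iteration space and NestedLoopCount below is 2.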
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target parallel for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
setFunctionHasBranchProtectedScope();
return OMPTargetParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
/// Check for existence of a clause of the given kind in the list of clauses.
static bool hasClauses(ArrayRef<OMPClause *> Clauses,
const OpenMPClauseKind K) {
return llvm::any_of(
Clauses, [K](const OMPClause *C) { return C->getClauseKind() == K; });
}
template <typename... Params>
static bool hasClauses(ArrayRef<OMPClause *> Clauses, const OpenMPClauseKind K,
const Params... ClauseTypes) {
return hasClauses(Clauses, K) || hasClauses(Clauses, ClauseTypes...);
}
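// Illustrative usage (mirrors the call sites below): the variadic overload
// checks any number of clause kinds at once, e.g.
//   hasClauses(Clauses, OMPC_to, OMPC_from)
// is true if at least one 'to' or 'from' clause is present.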
StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// OpenMP [2.12.2, target data Construct, Restrictions]
// At least one map, use_device_addr or use_device_ptr clause must appear on
// the directive.
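// Illustrative sketch (not from the original source; hypothetical user code,
// a/use are placeholders):
//
//   #pragma omp target data map(tofrom: a)   // accepted
//   { use(a); }
//   #pragma omp target data                  // rejected: no map/use_device_ptr/use_device_addr
//   { use(a); }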
if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr) &&
(LangOpts.OpenMP < 50 || !hasClauses(Clauses, OMPC_use_device_addr))) {
StringRef Expected;
if (LangOpts.OpenMP < 50)
Expected = "'map' or 'use_device_ptr'";
else
Expected = "'map', 'use_device_ptr', or 'use_device_addr'";
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< Expected << getOpenMPDirectiveName(OMPD_target_data);
return StmtError();
}
setFunctionHasBranchProtectedScope();
return OMPTargetDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt);
}
StmtResult
Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc, Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_enter_data);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
// OpenMP [2.10.2, Restrictions, p. 99]
// At least one map clause must appear on the directive.
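// Illustrative sketch (not from the original source; hypothetical user code):
//
//   #pragma omp target enter data map(to: a)   // accepted
//   #pragma omp target enter data              // rejected: missing 'map'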
if (!hasClauses(Clauses, OMPC_map)) {
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< "'map'" << getOpenMPDirectiveName(OMPD_target_enter_data);
return StmtError();
}
return OMPTargetEnterDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt);
}
StmtResult
Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc, Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_exit_data);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
// OpenMP [2.10.3, Restrictions, p. 102]
// At least one map clause must appear on the directive.
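// Illustrative sketch (not from the original source; hypothetical user code):
//
//   #pragma omp target exit data map(from: a)   // accepted
//   #pragma omp target exit data                // rejected: missing 'map'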
if (!hasClauses(Clauses, OMPC_map)) {
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< "'map'" << getOpenMPDirectiveName(OMPD_target_exit_data);
return StmtError();
}
return OMPTargetExitDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt);
}
StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc,
Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_update);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
if (!hasClauses(Clauses, OMPC_to, OMPC_from)) {
Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required);
return StmtError();
}
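// Illustrative sketch (not from the original source; hypothetical user code):
//
//   #pragma omp target update to(a)      // accepted
//   #pragma omp target update from(b)    // accepted
//   #pragma omp target update device(0)  // rejected by the check above: no 'to' or 'from'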
return OMPTargetUpdateDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt);
}
StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
setFunctionHasBranchProtectedScope();
DSAStack->setParentTeamsRegionLoc(StartLoc);
return OMPTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
}
StmtResult
Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc,
SourceLocation EndLoc,
OpenMPDirectiveKind CancelRegion) {
if (DSAStack->isParentNowaitRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 0;
return StmtError();
}
if (DSAStack->isParentOrderedRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 0;
return StmtError();
}
return OMPCancellationPointDirective::Create(Context, StartLoc, EndLoc,
CancelRegion);
}
StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc,
OpenMPDirectiveKind CancelRegion) {
if (DSAStack->isParentNowaitRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 1;
return StmtError();
}
if (DSAStack->isParentOrderedRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 1;
return StmtError();
}
DSAStack->setParentCancelRegion(/*Cancel=*/true);
return OMPCancelDirective::Create(Context, StartLoc, EndLoc, Clauses,
CancelRegion);
}
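// Illustrative sketch (not from the original source; hypothetical user code)
// of the nowait restriction checked above:
//
//   #pragma omp parallel
//   #pragma omp for nowait
//   for (int i = 0; i < n; ++i) {
//   #pragma omp cancel for      // rejected: the enclosing worksharing region has 'nowait'
//   }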
static bool checkGrainsizeNumTasksClauses(Sema &S,
ArrayRef<OMPClause *> Clauses) {
const OMPClause *PrevClause = nullptr;
bool ErrorFound = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_grainsize ||
C->getClauseKind() == OMPC_num_tasks) {
if (!PrevClause)
PrevClause = C;
else if (PrevClause->getClauseKind() != C->getClauseKind()) {
S.Diag(C->getBeginLoc(), diag::err_omp_clauses_mutually_exclusive)
<< getOpenMPClauseName(C->getClauseKind())
<< getOpenMPClauseName(PrevClause->getClauseKind());
S.Diag(PrevClause->getBeginLoc(), diag::note_omp_previous_clause)
<< getOpenMPClauseName(PrevClause->getClauseKind());
ErrorFound = true;
}
}
}
return ErrorFound;
}
static bool checkReductionClauseWithNogroup(Sema &S,
ArrayRef<OMPClause *> Clauses) {
const OMPClause *ReductionClause = nullptr;
const OMPClause *NogroupClause = nullptr;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_reduction) {
ReductionClause = C;
if (NogroupClause)
break;
continue;
}
if (C->getClauseKind() == OMPC_nogroup) {
NogroupClause = C;
if (ReductionClause)
break;
continue;
}
}
if (ReductionClause && NogroupClause) {
S.Diag(ReductionClause->getBeginLoc(), diag::err_omp_reduction_with_nogroup)
<< SourceRange(NogroupClause->getBeginLoc(),
NogroupClause->getEndLoc());
return true;
}
return false;
}
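// Illustrative sketch (not from the original source; hypothetical user code)
// of the two taskloop restrictions enforced by the helpers above:
//
//   #pragma omp taskloop grainsize(4) num_tasks(8)   // rejected: mutually exclusive clauses
//   for (int i = 0; i < n; ++i) work(i);
//   #pragma omp taskloop reduction(+ : s) nogroup    // rejected: reduction together with nogroup
//   for (int i = 0; i < n; ++i) s += a[i];
//   #pragma omp taskloop grainsize(4)                // accepted
//   for (int i = 0; i < n; ++i) work(i);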
StmtResult Sema::ActOnOpenMPTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPTaskLoopDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B,
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
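// Illustrative sketch (not from the original source; hypothetical user code),
// assuming the usual OpenMP restriction that 'simdlen' must not exceed
// 'safelen' when both are specified:
//
//   #pragma omp taskloop simd safelen(8) simdlen(4)   // accepted
//   for (int i = 0; i < n; ++i) a[i] += b[i];
//   #pragma omp taskloop simd safelen(4) simdlen(8)   // rejected by the check above
//   for (int i = 0; i < n; ++i) a[i] += b[i];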
setFunctionHasBranchProtectedScope();
return OMPTaskLoopSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_master_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPMasterTaskLoopDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B,
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_master_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPMasterTaskLoopSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_parallel_master_taskloop);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_parallel_master_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPParallelMasterTaskLoopDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_parallel_master_taskloop_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_parallel_master_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPParallelMasterTaskLoopSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, AStmt,
*this, *DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
setFunctionHasBranchProtectedScope();
return OMPDistributeDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_distribute_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
setFunctionHasBranchProtectedScope();
return OMPDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_distribute_parallel_for_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_distribute_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPDistributeSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' or 'ordered' clause specifies a number of loops, it
// defines the number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target parallel for simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPTargetParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTargetSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPTargetSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTeamsDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_teams_distribute);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp teams distribute loop exprs were not built");
setFunctionHasBranchProtectedScope();
DSAStack->setParentTeamsRegionLoc(StartLoc);
return OMPTeamsDistributeDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_teams_distribute_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp teams distribute simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
DSAStack->setParentTeamsRegionLoc(StartLoc);
return OMPTeamsDistributeSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_teams_distribute_parallel_for_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
DSAStack->setParentTeamsRegionLoc(StartLoc);
return OMPTeamsDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_teams_distribute_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If a 'collapse' clause specifies a number of loops, it defines the
// number of nested loops.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
setFunctionHasBranchProtectedScope();
DSAStack->setParentTeamsRegionLoc(StartLoc);
return OMPTeamsDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_teams);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
setFunctionHasBranchProtectedScope();
return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
AStmt);
}
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_target_teams_distribute);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If the 'collapse' clause is present with a loop count, it determines the
// number of nested loops to analyze.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute loop exprs were not built");
setFunctionHasBranchProtectedScope();
return OMPTargetTeamsDistributeDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_target_teams_distribute_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If the 'collapse' clause is present with a loop count, it determines the
// number of nested loops to analyze.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute parallel for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
setFunctionHasBranchProtectedScope();
return OMPTargetTeamsDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
}
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel = getOpenMPCaptureLevels(
OMPD_target_teams_distribute_parallel_for_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If the 'collapse' clause is present with a loop count, it determines the
// number of nested loops to analyze.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd,
getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute parallel for simd loop exprs were not "
"built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPTargetTeamsDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_target_teams_distribute_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CS->getCapturedDecl()->setNothrow();
}
OMPLoopDirective::HelperExprs B;
// If the 'collapse' clause is present with a loop count, it determines the
// number of nested loops to analyze.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
DSAStack))
return StmtError();
}
}
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
setFunctionHasBranchProtectedScope();
return OMPTargetTeamsDistributeSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_final:
Res = ActOnOpenMPFinalClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_num_threads:
Res = ActOnOpenMPNumThreadsClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_safelen:
Res = ActOnOpenMPSafelenClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_simdlen:
Res = ActOnOpenMPSimdlenClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_allocator:
Res = ActOnOpenMPAllocatorClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_collapse:
Res = ActOnOpenMPCollapseClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_ordered:
Res = ActOnOpenMPOrderedClause(StartLoc, EndLoc, LParenLoc, Expr);
break;
case OMPC_num_teams:
Res = ActOnOpenMPNumTeamsClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_thread_limit:
Res = ActOnOpenMPThreadLimitClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_priority:
Res = ActOnOpenMPPriorityClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_grainsize:
Res = ActOnOpenMPGrainsizeClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_num_tasks:
Res = ActOnOpenMPNumTasksClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_hint:
Res = ActOnOpenMPHintClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_depobj:
Res = ActOnOpenMPDepobjClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_detach:
Res = ActOnOpenMPDetachClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_device:
case OMPC_if:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_schedule:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_nogroup:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
default:
llvm_unreachable("Clause is not allowed.");
}
return Res;
}
// An OpenMP directive such as 'target parallel' has two captured regions:
// one for 'target' and one for 'parallel'. This function returns
// the region in which to capture expressions associated with a clause.
// A return value of OMPD_unknown signifies that the expression should not
// be captured.
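// For example, on 'target parallel' an 'if' clause that applies to the nested
// 'parallel' region is captured within the enclosing 'target' region, while
// an 'if' clause on a plain 'parallel' directive is not captured at all.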
static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, unsigned OpenMPVersion,
OpenMPDirectiveKind NameModifier = OMPD_unknown) {
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
switch (CKind) {
case OMPC_if:
switch (DKind) {
case OMPD_target_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
break;
}
LLVM_FALLTHROUGH;
case OMPD_target_parallel:
case OMPD_target_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'target' region, otherwise do not capture.
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
CaptureRegion = OMPD_target;
break;
case OMPD_target_teams_distribute_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
break;
}
LLVM_FALLTHROUGH;
case OMPD_target_teams_distribute_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'teams' region, otherwise do not capture.
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
CaptureRegion = OMPD_teams;
break;
case OMPD_teams_distribute_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
break;
}
LLVM_FALLTHROUGH;
case OMPD_teams_distribute_parallel_for:
CaptureRegion = OMPD_teams;
break;
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
CaptureRegion = OMPD_task;
break;
case OMPD_parallel_master_taskloop:
if (NameModifier == OMPD_unknown || NameModifier == OMPD_taskloop)
CaptureRegion = OMPD_parallel;
break;
case OMPD_parallel_master_taskloop_simd:
if ((OpenMPVersion <= 45 && NameModifier == OMPD_unknown) ||
NameModifier == OMPD_taskloop) {
CaptureRegion = OMPD_parallel;
break;
}
if (OpenMPVersion <= 45)
break;
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_taskloop;
break;
case OMPD_parallel_for_simd:
if (OpenMPVersion <= 45)
break;
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_parallel;
break;
case OMPD_taskloop_simd:
case OMPD_master_taskloop_simd:
if (OpenMPVersion <= 45)
break;
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_taskloop;
break;
case OMPD_distribute_parallel_for_simd:
if (OpenMPVersion <= 45)
break;
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_parallel;
break;
case OMPD_target_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd))
CaptureRegion = OMPD_target;
break;
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd))
CaptureRegion = OMPD_teams;
break;
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_target:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_distribute_parallel_for:
case OMPD_task:
case OMPD_taskloop:
case OMPD_master_taskloop:
case OMPD_target_data:
case OMPD_simd:
case OMPD_for_simd:
case OMPD_distribute_simd:
// Do not capture if-clause expressions.
break;
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_teams:
case OMPD_for:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_teams_distribute:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with if-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_num_threads:
switch (DKind) {
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
CaptureRegion = OMPD_target;
break;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_teams;
break;
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
// Do not capture num_threads-clause expressions.
break;
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_cancel:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_teams:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_threads-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_num_teams:
switch (DKind) {
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_target;
break;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
// Do not capture num_teams-clause expressions.
break;
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_teams-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_thread_limit:
switch (DKind) {
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_target;
break;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
// Do not capture thread_limit-clause expressions.
break;
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with thread_limit-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_schedule:
switch (DKind) {
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_parallel;
break;
case OMPD_for:
case OMPD_for_simd:
// Do not capture schedule-clause expressions.
break;
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_target_teams:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with schedule clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_dist_schedule:
switch (DKind) {
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
CaptureRegion = OMPD_teams;
break;
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_distribute:
case OMPD_distribute_simd:
// Do not capture dist_schedule-clause expressions.
break;
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target_parallel_for_simd:
case OMPD_target_parallel_for:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_teams:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_target_teams:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with schedule clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_device:
switch (DKind) {
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_task;
break;
case OMPD_target_data:
// Do not capture device-clause expressions.
break;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_teams-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_final:
case OMPC_priority:
switch (DKind) {
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
break;
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
CaptureRegion = OMPD_parallel;
break;
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_data:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with grainsize-clause");
case OMPD_unknown:
default:
llvm_unreachable("Unknown OpenMP directive");
}
break;
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_private:
case OMPC_shared:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_nogroup:
case OMPC_hint:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
default:
llvm_unreachable("Unexpected OpenMP clause.");
}
return CaptureRegion;
}
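// Builds an 'if' clause, e.g. 'if(parallel: cond)'. The optional directive
// name modifier selects which region of a combined construct the condition
// applies to; the condition is captured in the region reported by
// getOpenMPCaptureRegionForClause when one is required.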
OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier,
Expr *Condition, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation NameModifierLoc,
SourceLocation ColonLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Condition;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
!Condition->isInstantiationDependent() &&
!Condition->containsUnexpandedParameterPack()) {
ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
CaptureRegion = getOpenMPCaptureRegionForClause(
DKind, OMPC_if, LangOpts.OpenMP, NameModifier);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
}
return new (Context)
OMPIfClause(NameModifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc,
LParenLoc, NameModifierLoc, ColonLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPFinalClause(Expr *Condition,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Condition;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
!Condition->isInstantiationDependent() &&
!Condition->containsUnexpandedParameterPack()) {
ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
if (Val.isInvalid())
return nullptr;
ValExpr = MakeFullExpr(Val.get()).get();
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_final, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
}
return new (Context) OMPFinalClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
}
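// Performs the contextual implicit conversion of Op to an integer type,
// emitting OpenMP-specific diagnostics via the local IntConvertDiagnoser.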
ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc,
Expr *Op) {
if (!Op)
return ExprError();
class IntConvertDiagnoser : public ICEConvertDiagnoser {
public:
IntConvertDiagnoser()
: ICEConvertDiagnoser(/*AllowScopedEnumerations=*/false, /*Suppress=*/false,
/*SuppressConversion=*/true) {}
SemaDiagnosticBuilder diagnoseNotInt(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_not_integral) << T;
}
SemaDiagnosticBuilder diagnoseIncomplete(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_incomplete_type) << T;
}
SemaDiagnosticBuilder diagnoseExplicitConv(Sema &S, SourceLocation Loc,
QualType T,
QualType ConvTy) override {
return S.Diag(Loc, diag::err_omp_explicit_conversion) << T << ConvTy;
}
SemaDiagnosticBuilder noteExplicitConv(Sema &S, CXXConversionDecl *Conv,
QualType ConvTy) override {
return S.Diag(Conv->getLocation(), diag::note_omp_conversion_here)
<< ConvTy->isEnumeralType() << ConvTy;
}
SemaDiagnosticBuilder diagnoseAmbiguous(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_ambiguous_conversion) << T;
}
SemaDiagnosticBuilder noteAmbiguous(Sema &S, CXXConversionDecl *Conv,
QualType ConvTy) override {
return S.Diag(Conv->getLocation(), diag::note_omp_conversion_here)
<< ConvTy->isEnumeralType() << ConvTy;
}
SemaDiagnosticBuilder diagnoseConversion(Sema &, SourceLocation, QualType,
QualType) override {
llvm_unreachable("conversion functions are permitted");
}
} ConvertDiagnoser;
return PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser);
}
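/// Checks that \p ValExpr, after implicit integer conversion, is a
/// non-negative (or, if \p StrictlyPositive, strictly positive) integer
/// constant when it can be evaluated; optionally captures the expression in
/// the region reported by getOpenMPCaptureRegionForClause.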
static bool
isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind,
bool StrictlyPositive, bool BuildCapture = false,
OpenMPDirectiveKind DKind = OMPD_unknown,
OpenMPDirectiveKind *CaptureRegion = nullptr,
Stmt **HelperValStmt = nullptr) {
if (!ValExpr->isTypeDependent() && !ValExpr->isValueDependent() &&
!ValExpr->isInstantiationDependent()) {
SourceLocation Loc = ValExpr->getExprLoc();
ExprResult Value =
SemaRef.PerformOpenMPImplicitIntegerConversion(Loc, ValExpr);
if (Value.isInvalid())
return false;
ValExpr = Value.get();
// The expression must evaluate to a non-negative integer value.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, SemaRef.Context) &&
Result.isSigned() &&
!((!StrictlyPositive && Result.isNonNegative()) ||
(StrictlyPositive && Result.isStrictlyPositive()))) {
SemaRef.Diag(Loc, diag::err_omp_negative_expression_in_clause)
<< getOpenMPClauseName(CKind) << (StrictlyPositive ? 1 : 0)
<< ValExpr->getSourceRange();
return false;
}
if (!BuildCapture)
return true;
*CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, CKind, SemaRef.LangOpts.OpenMP);
if (*CaptureRegion != OMPD_unknown &&
!SemaRef.CurContext->isDependentContext()) {
ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
*HelperValStmt = buildPreInits(SemaRef.Context, Captures);
}
}
return true;
}
OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumThreads;
Stmt *HelperValStmt = nullptr;
// OpenMP [2.5, Restrictions]
// The num_threads expression must evaluate to a positive integer value.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_threads,
/*StrictlyPositive=*/true))
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
return new (Context) OMPNumThreadsClause(
ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
}
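// Verifies that the clause argument is an integer constant expression that is
// positive (or merely non-negative when StrictlyPositive is false), diagnoses
// non-power-of-two 'aligned' arguments, and records the associated loop count
// for 'collapse' and 'ordered' clauses.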
ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E,
OpenMPClauseKind CKind,
bool StrictlyPositive) {
if (!E)
return ExprError();
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
return E;
llvm::APSInt Result;
ExprResult ICE = VerifyIntegerConstantExpression(E, &Result);
if (ICE.isInvalid())
return ExprError();
if ((StrictlyPositive && !Result.isStrictlyPositive()) ||
(!StrictlyPositive && !Result.isNonNegative())) {
Diag(E->getExprLoc(), diag::err_omp_negative_expression_in_clause)
<< getOpenMPClauseName(CKind) << (StrictlyPositive ? 1 : 0)
<< E->getSourceRange();
return ExprError();
}
if (CKind == OMPC_aligned && !Result.isPowerOf2()) {
Diag(E->getExprLoc(), diag::warn_omp_alignment_not_power_of_two)
<< E->getSourceRange();
return ExprError();
}
if (CKind == OMPC_collapse && DSAStack->getAssociatedLoops() == 1)
DSAStack->setAssociatedLoops(Result.getExtValue());
else if (CKind == OMPC_ordered)
DSAStack->setAssociatedLoops(Result.getExtValue());
return ICE;
}
OMPClause *Sema::ActOnOpenMPSafelenClause(Expr *Len, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the safelen clause must be a constant
// positive integer expression.
ExprResult Safelen = VerifyPositiveIntegerConstantInClause(Len, OMPC_safelen);
if (Safelen.isInvalid())
return nullptr;
return new (Context)
OMPSafelenClause(Safelen.get(), StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPSimdlenClause(Expr *Len, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the simdlen clause must be a constant
// positive integer expression.
ExprResult Simdlen = VerifyPositiveIntegerConstantInClause(Len, OMPC_simdlen);
if (Simdlen.isInvalid())
return nullptr;
return new (Context)
OMPSimdlenClause(Simdlen.get(), StartLoc, LParenLoc, EndLoc);
}
/// Tries to find omp_allocator_handle_t type.
static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc,
DSAStackTy *Stack) {
QualType OMPAllocatorHandleT = Stack->getOMPAllocatorHandleT();
if (!OMPAllocatorHandleT.isNull())
return true;
// Build the predefined allocator expressions.
bool ErrorFound = false;
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
StringRef Allocator =
OMPAllocateDeclAttr::ConvertAllocatorTypeTyToStr(AllocatorKind);
DeclarationName AllocatorName = &S.getASTContext().Idents.get(Allocator);
auto *VD = dyn_cast_or_null<ValueDecl>(
S.LookupSingleName(S.TUScope, AllocatorName, Loc, Sema::LookupAnyName));
if (!VD) {
ErrorFound = true;
break;
}
QualType AllocatorType =
VD->getType().getNonLValueExprType(S.getASTContext());
ExprResult Res = S.BuildDeclRefExpr(VD, AllocatorType, VK_LValue, Loc);
if (!Res.isUsable()) {
ErrorFound = true;
break;
}
if (OMPAllocatorHandleT.isNull())
OMPAllocatorHandleT = AllocatorType;
if (!S.getASTContext().hasSameType(OMPAllocatorHandleT, AllocatorType)) {
ErrorFound = true;
break;
}
Stack->setAllocator(AllocatorKind, Res.get());
}
if (ErrorFound) {
S.Diag(Loc, diag::err_omp_implied_type_not_found)
<< "omp_allocator_handle_t";
return false;
}
OMPAllocatorHandleT.addConst();
Stack->setOMPAllocatorHandleT(OMPAllocatorHandleT);
return true;
}
OMPClause *Sema::ActOnOpenMPAllocatorClause(Expr *A, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.11.3, allocate Directive, Description]
// allocator is an expression of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, A->getExprLoc(), DSAStack))
return nullptr;
ExprResult Allocator = DefaultLvalueConversion(A);
if (Allocator.isInvalid())
return nullptr;
Allocator = PerformImplicitConversion(Allocator.get(),
DSAStack->getOMPAllocatorHandleT(),
Sema::AA_Initializing,
/*AllowExplicit=*/true);
if (Allocator.isInvalid())
return nullptr;
return new (Context)
OMPAllocatorClause(Allocator.get(), StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.7.1, loop construct, Description]
// OpenMP [2.8.1, simd construct, Description]
// OpenMP [2.9.6, distribute construct, Description]
// The parameter of the collapse clause must be a constant
// positive integer expression.
ExprResult NumForLoopsResult =
VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_collapse);
if (NumForLoopsResult.isInvalid())
return nullptr;
return new (Context)
OMPCollapseClause(NumForLoopsResult.get(), StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc,
SourceLocation EndLoc,
SourceLocation LParenLoc,
Expr *NumForLoops) {
// OpenMP [2.7.1, loop construct, Description]
// OpenMP [2.8.1, simd construct, Description]
// OpenMP [2.9.6, distribute construct, Description]
// The parameter of the ordered clause must be a constant
// positive integer expression if any.
if (NumForLoops && LParenLoc.isValid()) {
ExprResult NumForLoopsResult =
VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_ordered);
if (NumForLoopsResult.isInvalid())
return nullptr;
NumForLoops = NumForLoopsResult.get();
} else {
NumForLoops = nullptr;
}
auto *Clause = OMPOrderedClause::Create(
Context, NumForLoops, NumForLoops ? DSAStack->getAssociatedLoops() : 0,
StartLoc, LParenLoc, EndLoc);
DSAStack->setOrderedRegion(/*IsOrdered=*/true, NumForLoops, Clause);
return Clause;
}
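// Dispatches clauses that take a single enumeration argument, such as
// 'default(none)', 'proc_bind(close)' or 'order(concurrent)', to their
// clause-specific handlers.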
OMPClause *Sema::ActOnOpenMPSimpleClause(
OpenMPClauseKind Kind, unsigned Argument, SourceLocation ArgumentLoc,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_default:
Res = ActOnOpenMPDefaultClause(static_cast<DefaultKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_proc_bind:
Res = ActOnOpenMPProcBindClause(static_cast<ProcBindKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_atomic_default_mem_order:
Res = ActOnOpenMPAtomicDefaultMemOrderClause(
static_cast<OpenMPAtomicDefaultMemOrderClauseKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_order:
Res = ActOnOpenMPOrderClause(static_cast<OpenMPOrderClauseKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_update:
Res = ActOnOpenMPUpdateClause(static_cast<OpenMPDependClauseKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_if:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_schedule:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_device:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
default:
llvm_unreachable("Clause is not allowed.");
}
return Res;
}
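/// Builds a human-readable, quoted list of the simple-clause values in the
/// range [First, Last), skipping any entries in \p Exclude; used when
/// diagnosing an unexpected clause value.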
static std::string
getListOfPossibleValues(OpenMPClauseKind K, unsigned First, unsigned Last,
ArrayRef<unsigned> Exclude = llvm::None) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
unsigned Skipped = Exclude.size();
auto S = Exclude.begin(), E = Exclude.end();
for (unsigned I = First; I < Last; ++I) {
if (std::find(S, E, I) != E) {
--Skipped;
continue;
}
Out << "'" << getOpenMPSimpleClauseTypeName(K, I) << "'";
if (I + Skipped + 2 == Last)
Out << " or ";
else if (I + Skipped + 1 != Last)
Out << ", ";
}
return std::string(Out.str());
}
OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMP_DEFAULT_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_default, /*First=*/0,
/*Last=*/unsigned(OMP_DEFAULT_unknown))
<< getOpenMPClauseName(OMPC_default);
return nullptr;
}
switch (Kind) {
case OMP_DEFAULT_none:
DSAStack->setDefaultDSANone(KindKwLoc);
break;
case OMP_DEFAULT_shared:
DSAStack->setDefaultDSAShared(KindKwLoc);
break;
case OMP_DEFAULT_firstprivate:
DSAStack->setDefaultDSAFirstPrivate(KindKwLoc);
break;
default:
llvm_unreachable("DSA unexpected in OpenMP default clause");
}
return new (Context)
OMPDefaultClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPProcBindClause(ProcBindKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMP_PROC_BIND_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_proc_bind,
/*First=*/unsigned(OMP_PROC_BIND_master),
/*Last=*/5)
<< getOpenMPClauseName(OMPC_proc_bind);
return nullptr;
}
return new (Context)
OMPProcBindClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause(
OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindKwLoc,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
if (Kind == OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(
OMPC_atomic_default_mem_order, /*First=*/0,
/*Last=*/OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown)
<< getOpenMPClauseName(OMPC_atomic_default_mem_order);
return nullptr;
}
return new (Context) OMPAtomicDefaultMemOrderClause(Kind, KindKwLoc, StartLoc,
LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPOrderClause(OpenMPOrderClauseKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_ORDER_unknown) {
static_assert(OMPC_ORDER_unknown > 0,
"OMPC_ORDER_unknown not greater than 0");
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_order, /*First=*/0,
/*Last=*/OMPC_ORDER_unknown)
<< getOpenMPClauseName(OMPC_order);
return nullptr;
}
return new (Context)
OMPOrderClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_DEPEND_unknown || Kind == OMPC_DEPEND_source ||
Kind == OMPC_DEPEND_sink || Kind == OMPC_DEPEND_depobj) {
unsigned Except[] = {OMPC_DEPEND_source, OMPC_DEPEND_sink,
OMPC_DEPEND_depobj};
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_depend, /*First=*/0,
/*Last=*/OMPC_DEPEND_unknown, Except)
<< getOpenMPClauseName(OMPC_update);
return nullptr;
}
return OMPUpdateClause::Create(Context, StartLoc, LParenLoc, KindKwLoc, Kind,
EndLoc);
}
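// Dispatches clauses that combine enumeration arguments with an optional
// expression, such as 'schedule(static, chunk)', 'dist_schedule(static,
// chunk)', 'defaultmap(tofrom: scalar)' or 'device(device_num: expr)'.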
OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause(
OpenMPClauseKind Kind, ArrayRef<unsigned> Argument, Expr *Expr,
SourceLocation StartLoc, SourceLocation LParenLoc,
ArrayRef<SourceLocation> ArgumentLoc, SourceLocation DelimLoc,
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_schedule:
enum { Modifier1, Modifier2, ScheduleKind, NumberOfElements };
assert(Argument.size() == NumberOfElements &&
ArgumentLoc.size() == NumberOfElements);
Res = ActOnOpenMPScheduleClause(
static_cast<OpenMPScheduleClauseModifier>(Argument[Modifier1]),
static_cast<OpenMPScheduleClauseModifier>(Argument[Modifier2]),
static_cast<OpenMPScheduleClauseKind>(Argument[ScheduleKind]), Expr,
StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
break;
case OMPC_if:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
Expr, StartLoc, LParenLoc, ArgumentLoc.back(),
DelimLoc, EndLoc);
break;
case OMPC_dist_schedule:
Res = ActOnOpenMPDistScheduleClause(
static_cast<OpenMPDistScheduleClauseKind>(Argument.back()), Expr,
StartLoc, LParenLoc, ArgumentLoc.back(), DelimLoc, EndLoc);
break;
case OMPC_defaultmap:
enum { Modifier, DefaultmapKind };
Res = ActOnOpenMPDefaultmapClause(
static_cast<OpenMPDefaultmapClauseModifier>(Argument[Modifier]),
static_cast<OpenMPDefaultmapClauseKind>(Argument[DefaultmapKind]),
StartLoc, LParenLoc, ArgumentLoc[Modifier], ArgumentLoc[DefaultmapKind],
EndLoc);
break;
case OMPC_device:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPDeviceClause(
static_cast<OpenMPDeviceClauseModifier>(Argument.back()), Expr,
StartLoc, LParenLoc, ArgumentLoc.back(), EndLoc);
break;
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
default:
llvm_unreachable("Clause is not allowed.");
}
return Res;
}
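/// Diagnoses an unknown schedule modifier \p M1, listing the modifier values
/// that are still acceptable given the other specified modifier \p M2.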
static bool checkScheduleModifiers(Sema &S, OpenMPScheduleClauseModifier M1,
OpenMPScheduleClauseModifier M2,
SourceLocation M1Loc, SourceLocation M2Loc) {
if (M1 == OMPC_SCHEDULE_MODIFIER_unknown && M1Loc.isValid()) {
SmallVector<unsigned, 2> Excluded;
if (M2 != OMPC_SCHEDULE_MODIFIER_unknown)
Excluded.push_back(M2);
if (M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic)
Excluded.push_back(OMPC_SCHEDULE_MODIFIER_monotonic);
if (M2 == OMPC_SCHEDULE_MODIFIER_monotonic)
Excluded.push_back(OMPC_SCHEDULE_MODIFIER_nonmonotonic);
S.Diag(M1Loc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_schedule,
/*First=*/OMPC_SCHEDULE_MODIFIER_unknown + 1,
/*Last=*/OMPC_SCHEDULE_MODIFIER_last,
Excluded)
<< getOpenMPClauseName(OMPC_schedule);
return true;
}
return false;
}
OMPClause *Sema::ActOnOpenMPScheduleClause(
OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc,
SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) {
if (checkScheduleModifiers(*this, M1, M2, M1Loc, M2Loc) ||
checkScheduleModifiers(*this, M2, M1, M2Loc, M1Loc))
return nullptr;
// OpenMP, 2.7.1, Loop Construct, Restrictions
// Either the monotonic modifier or the nonmonotonic modifier can be specified
// but not both.
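// For example, 'schedule(monotonic, nonmonotonic: dynamic)' is rejected here.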
if ((M1 == M2 && M1 != OMPC_SCHEDULE_MODIFIER_unknown) ||
(M1 == OMPC_SCHEDULE_MODIFIER_monotonic &&
M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) ||
(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic &&
M2 == OMPC_SCHEDULE_MODIFIER_monotonic)) {
Diag(M2Loc, diag::err_omp_unexpected_schedule_modifier)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule, M2)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule, M1);
return nullptr;
}
if (Kind == OMPC_SCHEDULE_unknown) {
std::string Values;
if (M1Loc.isInvalid() && M2Loc.isInvalid()) {
unsigned Exclude[] = {OMPC_SCHEDULE_unknown};
Values = getListOfPossibleValues(OMPC_schedule, /*First=*/0,
/*Last=*/OMPC_SCHEDULE_MODIFIER_last,
Exclude);
} else {
Values = getListOfPossibleValues(OMPC_schedule, /*First=*/0,
/*Last=*/OMPC_SCHEDULE_unknown);
}
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_schedule);
return nullptr;
}
// OpenMP, 2.7.1, Loop Construct, Restrictions
// The nonmonotonic modifier can only be specified with schedule(dynamic) or
// schedule(guided).
// OpenMP 5.0 does not have this restriction.
if (LangOpts.OpenMP < 50 &&
(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) &&
Kind != OMPC_SCHEDULE_dynamic && Kind != OMPC_SCHEDULE_guided) {
Diag(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ? M1Loc : M2Loc,
diag::err_omp_schedule_nonmonotonic_static);
return nullptr;
}
Expr *ValExpr = ChunkSize;
Stmt *HelperValStmt = nullptr;
if (ChunkSize) {
if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
!ChunkSize->isInstantiationDependent() &&
!ChunkSize->containsUnexpandedParameterPack()) {
SourceLocation ChunkSizeLoc = ChunkSize->getBeginLoc();
ExprResult Val =
PerformOpenMPImplicitIntegerConversion(ChunkSizeLoc, ChunkSize);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
// OpenMP [2.7.1, Restrictions]
// chunk_size must be a loop invariant integer expression with a positive
// value.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, Context)) {
if (Result.isSigned() && !Result.isStrictlyPositive()) {
Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause)
<< "schedule" << 1 << ChunkSize->getSourceRange();
return nullptr;
}
} else if (getOpenMPCaptureRegionForClause(
DSAStack->getCurrentDirective(), OMPC_schedule,
LangOpts.OpenMP) != OMPD_unknown &&
!CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
}
}
return new (Context)
OMPScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind,
ValExpr, HelperValStmt, M1, M1Loc, M2, M2Loc);
}
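// Dispatches clauses that take no arguments, such as 'nowait', 'untied',
// 'mergeable' or the atomic memory-order clauses, to their handlers.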
OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
SourceLocation StartLoc,
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_ordered:
Res = ActOnOpenMPOrderedClause(StartLoc, EndLoc);
break;
case OMPC_nowait:
Res = ActOnOpenMPNowaitClause(StartLoc, EndLoc);
break;
case OMPC_untied:
Res = ActOnOpenMPUntiedClause(StartLoc, EndLoc);
break;
case OMPC_mergeable:
Res = ActOnOpenMPMergeableClause(StartLoc, EndLoc);
break;
case OMPC_read:
Res = ActOnOpenMPReadClause(StartLoc, EndLoc);
break;
case OMPC_write:
Res = ActOnOpenMPWriteClause(StartLoc, EndLoc);
break;
case OMPC_update:
Res = ActOnOpenMPUpdateClause(StartLoc, EndLoc);
break;
case OMPC_capture:
Res = ActOnOpenMPCaptureClause(StartLoc, EndLoc);
break;
case OMPC_seq_cst:
Res = ActOnOpenMPSeqCstClause(StartLoc, EndLoc);
break;
case OMPC_acq_rel:
Res = ActOnOpenMPAcqRelClause(StartLoc, EndLoc);
break;
case OMPC_acquire:
Res = ActOnOpenMPAcquireClause(StartLoc, EndLoc);
break;
case OMPC_release:
Res = ActOnOpenMPReleaseClause(StartLoc, EndLoc);
break;
case OMPC_relaxed:
Res = ActOnOpenMPRelaxedClause(StartLoc, EndLoc);
break;
case OMPC_threads:
Res = ActOnOpenMPThreadsClause(StartLoc, EndLoc);
break;
case OMPC_simd:
Res = ActOnOpenMPSIMDClause(StartLoc, EndLoc);
break;
case OMPC_nogroup:
Res = ActOnOpenMPNogroupClause(StartLoc, EndLoc);
break;
case OMPC_unified_address:
Res = ActOnOpenMPUnifiedAddressClause(StartLoc, EndLoc);
break;
case OMPC_unified_shared_memory:
Res = ActOnOpenMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
break;
case OMPC_reverse_offload:
Res = ActOnOpenMPReverseOffloadClause(StartLoc, EndLoc);
break;
case OMPC_dynamic_allocators:
Res = ActOnOpenMPDynamicAllocatorsClause(StartLoc, EndLoc);
break;
case OMPC_destroy:
Res = ActOnOpenMPDestroyClause(StartLoc, EndLoc);
break;
case OMPC_if:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_schedule:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_depend:
case OMPC_device:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
default:
llvm_unreachable("Clause is not allowed.");
}
return Res;
}
OMPClause *Sema::ActOnOpenMPNowaitClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
DSAStack->setNowaitRegion();
return new (Context) OMPNowaitClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPUntiedClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUntiedClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPMergeableClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPMergeableClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPReadClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReadClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPWriteClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPWriteClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPUpdateClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPUpdateClause::Create(Context, StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPCaptureClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPCaptureClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPSeqCstClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPSeqCstClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPAcqRelClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPAcqRelClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPAcquireClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPAcquireClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPReleaseClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReleaseClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPRelaxedClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPRelaxedClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPThreadsClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPThreadsClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPSIMDClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPSIMDClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPNogroupClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPNogroupClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUnifiedAddressClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReverseOffloadClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPDestroyClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPDestroyClause(StartLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPVarListClause(
OpenMPClauseKind Kind, ArrayRef<Expr *> VarList, Expr *DepModOrTailExpr,
const OMPVarListLocTy &Locs, SourceLocation ColonLoc,
CXXScopeSpec &ReductionOrMapperIdScopeSpec,
DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier,
ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
ArrayRef<SourceLocation> MapTypeModifiersLoc, bool IsMapTypeImplicit,
SourceLocation ExtraModifierLoc) {
SourceLocation StartLoc = Locs.StartLoc;
SourceLocation LParenLoc = Locs.LParenLoc;
SourceLocation EndLoc = Locs.EndLoc;
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_private:
Res = ActOnOpenMPPrivateClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_firstprivate:
Res = ActOnOpenMPFirstprivateClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_lastprivate:
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_LASTPRIVATE_unknown &&
"Unexpected lastprivate modifier.");
Res = ActOnOpenMPLastprivateClause(
VarList, static_cast<OpenMPLastprivateModifier>(ExtraModifier),
ExtraModifierLoc, ColonLoc, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_shared:
Res = ActOnOpenMPSharedClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_reduction:
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_REDUCTION_unknown &&
"Unexpected lastprivate modifier.");
Res = ActOnOpenMPReductionClause(
VarList, static_cast<OpenMPReductionClauseModifier>(ExtraModifier),
StartLoc, LParenLoc, ExtraModifierLoc, ColonLoc, EndLoc,
ReductionOrMapperIdScopeSpec, ReductionOrMapperId);
break;
case OMPC_task_reduction:
Res = ActOnOpenMPTaskReductionClause(VarList, StartLoc, LParenLoc, ColonLoc,
EndLoc, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId);
break;
case OMPC_in_reduction:
Res = ActOnOpenMPInReductionClause(VarList, StartLoc, LParenLoc, ColonLoc,
EndLoc, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId);
break;
case OMPC_linear:
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_LINEAR_unknown &&
"Unexpected linear modifier.");
Res = ActOnOpenMPLinearClause(
VarList, DepModOrTailExpr, StartLoc, LParenLoc,
static_cast<OpenMPLinearClauseKind>(ExtraModifier), ExtraModifierLoc,
ColonLoc, EndLoc);
break;
case OMPC_aligned:
Res = ActOnOpenMPAlignedClause(VarList, DepModOrTailExpr, StartLoc,
LParenLoc, ColonLoc, EndLoc);
break;
case OMPC_copyin:
Res = ActOnOpenMPCopyinClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_copyprivate:
Res = ActOnOpenMPCopyprivateClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_flush:
Res = ActOnOpenMPFlushClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_depend:
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_DEPEND_unknown &&
"Unexpected depend modifier.");
Res = ActOnOpenMPDependClause(
DepModOrTailExpr, static_cast<OpenMPDependClauseKind>(ExtraModifier),
ExtraModifierLoc, ColonLoc, VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_map:
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_MAP_unknown &&
"Unexpected map modifier.");
Res = ActOnOpenMPMapClause(
MapTypeModifiers, MapTypeModifiersLoc, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, static_cast<OpenMPMapClauseKind>(ExtraModifier),
IsMapTypeImplicit, ExtraModifierLoc, ColonLoc, VarList, Locs);
break;
case OMPC_to:
Res = ActOnOpenMPToClause(VarList, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, Locs);
break;
case OMPC_from:
Res = ActOnOpenMPFromClause(VarList, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, Locs);
break;
case OMPC_use_device_ptr:
Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs);
break;
case OMPC_use_device_addr:
Res = ActOnOpenMPUseDeviceAddrClause(VarList, Locs);
break;
case OMPC_is_device_ptr:
Res = ActOnOpenMPIsDevicePtrClause(VarList, Locs);
break;
case OMPC_allocate:
Res = ActOnOpenMPAllocateClause(DepModOrTailExpr, VarList, StartLoc,
LParenLoc, ColonLoc, EndLoc);
break;
case OMPC_nontemporal:
Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_inclusive:
Res = ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_exclusive:
Res = ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, EndLoc);
break;
case OMPC_affinity:
Res = ActOnOpenMPAffinityClause(StartLoc, LParenLoc, ColonLoc, EndLoc,
DepModOrTailExpr, VarList);
break;
case OMPC_if:
case OMPC_depobj:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_schedule:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_device:
case OMPC_threads:
case OMPC_simd:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_uses_allocators:
default:
llvm_unreachable("Clause is not allowed.");
}
return Res;
}
ExprResult Sema::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
ExprObjectKind OK, SourceLocation Loc) {
ExprResult Res = BuildDeclRefExpr(
Capture, Capture->getType().getNonReferenceType(), VK_LValue, Loc);
if (!Res.isUsable())
return ExprError();
if (OK == OK_Ordinary && !getLangOpts().CPlusPlus) {
Res = CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get());
if (!Res.isUsable())
return ExprError();
}
if (VK != VK_LValue && Res.get()->isGLValue()) {
Res = DefaultLvalueConversion(Res.get());
if (!Res.isUsable())
return ExprError();
}
return Res;
}
OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> PrivateCopies;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP private clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
PrivateCopies.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
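// Illustrative example (not from the original source):
//   struct S;                        // declared but not defined
//   extern struct S s;               // hypothetical declaration
//   #pragma omp parallel private(s)  // rejected: incomplete type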
if (RequireCompleteType(ELoc, Type, diag::err_omp_private_incomplete_type))
continue;
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause.
//
// OpenMP 3.1 [2.9.3.3, private clause, Restrictions]
// A variable that appears in a private clause must not have a
// const-qualified type unless it is of class type with a mutable member.
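// Illustrative example (not from the original source):
//   const int c = 0;
//   #pragma omp parallel private(c)  // rejected: const-qualified, no mutable member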
if (rejectConstNotMutableType(*this, D, Type, OMPC_private, ELoc))
continue;
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_private);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
// Variably modified types are not supported for tasks.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
isOpenMPTaskingDirective(CurrDir)) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_private) << Type
<< getOpenMPDirectiveName(CurrDir);
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
// OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
//
// OpenMP 5.0 [2.19.7.1, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
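// Illustrative example (not from the original source): on a plain 'target'
// construct (or any target construct before OpenMP 5.0),
//   #pragma omp target map(tofrom: x) private(x)
// is diagnosed as a map/data-sharing conflict.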
if ((LangOpts.OpenMP <= 45 && isOpenMPTargetExecutionDirective(CurrDir)) ||
CurrDir == OMPD_target) {
OpenMPClauseKind ConflictKind;
if (DSAStack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[&](OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind WhereFoundClauseKind) -> bool {
ConflictKind = WhereFoundClauseKind;
return true;
})) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(OMPC_private)
<< getOpenMPClauseName(ConflictKind)
<< getOpenMPDirectiveName(CurrDir);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
// OpenMP [2.9.3.3, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a private
// clause requires an accessible, unambiguous default constructor for the
// class type.
// Generate a helper private variable and initialize it with the default
// value. The address of the original variable is replaced by the address of
// the new private variable in CodeGen. This new variable is not added to the
// IdResolver, so the code in the OpenMP region uses the original variable for
// proper diagnostics.
Type = Type.getUnqualifiedType();
VarDecl *VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
ActOnUninitializedDecl(VDPrivate);
if (VDPrivate->isInvalidDecl())
continue;
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_private, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
PrivateCopies.push_back(VDPrivateRefExpr);
}
if (Vars.empty())
return nullptr;
return OMPPrivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
PrivateCopies);
}
namespace {
class DiagsUninitializedSeveretyRAII {
private:
DiagnosticsEngine &Diags;
SourceLocation SavedLoc;
bool IsIgnored = false;
public:
DiagsUninitializedSeveretyRAII(DiagnosticsEngine &Diags, SourceLocation Loc,
bool IsIgnored)
: Diags(Diags), SavedLoc(Loc), IsIgnored(IsIgnored) {
if (!IsIgnored) {
Diags.setSeverity(/*Diag*/ diag::warn_uninit_self_reference_in_init,
/*Map*/ diag::Severity::Ignored, Loc);
}
}
~DiagsUninitializedSeveretyRAII() {
if (!IsIgnored)
Diags.popMappings(SavedLoc);
}
};
}
OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> PrivateCopies;
SmallVector<Expr *, 8> Inits;
SmallVector<Decl *, 4> ExprCaptures;
bool IsImplicitClause =
StartLoc.isInvalid() && LParenLoc.isInvalid() && EndLoc.isInvalid();
SourceLocation ImplicitClauseLoc = DSAStack->getConstructLoc();
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP firstprivate clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
PrivateCopies.push_back(nullptr);
Inits.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
ELoc = IsImplicitClause ? ImplicitClauseLoc : ELoc;
QualType Type = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
if (RequireCompleteType(ELoc, Type,
diag::err_omp_firstprivate_incomplete_type))
continue;
Type = Type.getNonReferenceType();
// OpenMP [2.9.3.4, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a private
// clause requires an accessible, unambiguous copy constructor for the
// class type.
QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType();
// If an implicit firstprivate variable was found, it was checked already.
DSAStackTy::DSAVarData TopDVar;
if (!IsImplicitClause) {
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
TopDVar = DVar;
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
bool IsConstant = ElemType.isConstant(Context);
// OpenMP [2.4.13, Data-sharing Attribute Clauses]
// A list item that specifies a given variable may not appear in more
// than one clause on the same directive, except that a variable may be
// specified in both firstprivate and lastprivate clauses.
// OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
// A list item may appear in a firstprivate or lastprivate clause but not
// both.
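// Illustrative example (not from the original source): firstprivate(x)
// together with lastprivate(x) is accepted on 'parallel for', but the same
// combination on a 'distribute' construct is rejected by the check below.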
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_firstprivate &&
(isOpenMPDistributeDirective(CurrDir) ||
DVar.CKind != OMPC_lastprivate) &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, p.2]
// Variables with const-qualified type having no mutable member may be
// listed in a firstprivate clause, even if they are static data members.
if (!(IsConstant || (VD && VD->isStaticDataMember())) && !DVar.RefExpr &&
DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
// OpenMP [2.9.3.4, Restrictions, p.2]
// A list item that is private within a parallel region must not appear
// in a firstprivate clause on a worksharing construct if any of the
// worksharing regions arising from the worksharing construct ever bind
// to any of the parallel regions arising from the parallel construct.
// OpenMP 4.5 [2.15.3.4, Restrictions, p.3]
// A list item that is private within a teams region must not appear in a
// firstprivate clause on a distribute construct if any of the distribute
// regions arising from the distribute construct ever bind to any of the
// teams regions arising from the teams construct.
// OpenMP 4.5 [2.15.3.4, Restrictions, p.3]
// A list item that appears in a reduction clause of a teams construct
// must not appear in a firstprivate clause on a distribute construct if
// any of the distribute regions arising from the distribute construct
// ever bind to any of the teams regions arising from the teams construct.
if ((isOpenMPWorksharingDirective(CurrDir) ||
isOpenMPDistributeDirective(CurrDir)) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = DSAStack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared &&
(isOpenMPParallelDirective(DVar.DKind) ||
isOpenMPTeamsDirective(DVar.DKind) ||
DVar.DKind == OMPD_unknown)) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_firstprivate)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
// OpenMP [2.9.3.4, Restrictions, p.3]
// A list item that appears in a reduction clause of a parallel construct
// must not appear in a firstprivate clause on a worksharing or task
// construct if any of the worksharing or task regions arising from the
// worksharing or task construct ever bind to any of the parallel regions
// arising from the parallel construct.
// OpenMP [2.9.3.4, Restrictions, p.4]
// A list item that appears in a reduction clause in worksharing
// construct must not appear in a firstprivate clause in a task construct
// encountered during execution of any of the worksharing regions arising
// from the worksharing construct.
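// Illustrative example (not from the original source): inside
//   #pragma omp parallel reduction(+: x)
// a nested
//   #pragma omp task firstprivate(x)
// is rejected here because x is a reduction item of the binding parallel region.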
if (isOpenMPTaskingDirective(CurrDir)) {
DVar = DSAStack->hasInnermostDSA(
D, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) ||
isOpenMPTeamsDirective(K);
},
/*FromParent=*/true);
if (DVar.CKind == OMPC_reduction &&
(isOpenMPParallelDirective(DVar.DKind) ||
isOpenMPWorksharingDirective(DVar.DKind) ||
isOpenMPTeamsDirective(DVar.DKind))) {
Diag(ELoc, diag::err_omp_parallel_reduction_in_task_firstprivate)
<< getOpenMPDirectiveName(DVar.DKind);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
// OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
//
// OpenMP 5.0 [2.19.7.1, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
if ((LangOpts.OpenMP <= 45 &&
isOpenMPTargetExecutionDirective(CurrDir)) ||
CurrDir == OMPD_target) {
OpenMPClauseKind ConflictKind;
if (DSAStack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
[&ConflictKind](
OMPClauseMappableExprCommon::MappableExprComponentListRef,
OpenMPClauseKind WhereFoundClauseKind) {
ConflictKind = WhereFoundClauseKind;
return true;
})) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(OMPC_firstprivate)
<< getOpenMPClauseName(ConflictKind)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
}
// Variably modified types are not supported for tasks.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_firstprivate) << Type
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
Type = Type.getUnqualifiedType();
VarDecl *VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
// Generate a helper private variable and initialize it with the value of the
// original variable. The address of the original variable is replaced by
// the address of the new private variable in CodeGen. This new variable
// is not added to the IdResolver, so the code in the OpenMP region uses the
// original variable for proper diagnostics and variable capturing.
Expr *VDInitRefExpr = nullptr;
// For arrays, generate an initializer for a single element and replace it
// with the original array element in CodeGen.
if (Type->isArrayType()) {
VarDecl *VDInit =
buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, D->getName());
VDInitRefExpr = buildDeclRefExpr(*this, VDInit, ElemType, ELoc);
Expr *Init = DefaultLvalueConversion(VDInitRefExpr).get();
ElemType = ElemType.getUnqualifiedType();
VarDecl *VDInitTemp = buildVarDecl(*this, RefExpr->getExprLoc(), ElemType,
".firstprivate.temp");
InitializedEntity Entity =
InitializedEntity::InitializeVariable(VDInitTemp);
InitializationKind Kind = InitializationKind::CreateCopy(ELoc, ELoc);
InitializationSequence InitSeq(*this, Entity, Kind, Init);
ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Init);
if (Result.isInvalid())
VDPrivate->setInvalidDecl();
else
VDPrivate->setInit(Result.getAs<Expr>());
// Remove temp variable declaration.
Context.Deallocate(VDInitTemp);
} else {
VarDecl *VDInit = buildVarDecl(*this, RefExpr->getExprLoc(), Type,
".firstprivate.temp");
VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(),
RefExpr->getExprLoc());
AddInitializerToDecl(VDPrivate,
DefaultLvalueConversion(VDInitRefExpr).get(),
/*DirectInit=*/false);
}
if (VDPrivate->isInvalidDecl()) {
if (IsImplicitClause) {
Diag(RefExpr->getExprLoc(),
diag::note_omp_task_predetermined_firstprivate_here);
}
continue;
}
CurContext->addDecl(VDPrivate);
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(),
RefExpr->getExprLoc());
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext()) {
if (TopDVar.CKind == OMPC_lastprivate) {
Ref = TopDVar.PrivateCopy;
} else {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
if (!isOpenMPCapturedDecl(D))
ExprCaptures.push_back(Ref->getDecl());
}
}
if (!IsImplicitClause)
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
PrivateCopies.push_back(VDPrivateRefExpr);
Inits.push_back(VDInitRefExpr);
}
if (Vars.empty())
return nullptr;
return OMPFirstprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, PrivateCopies, Inits,
buildPreInits(Context, ExprCaptures));
}
OMPClause *Sema::ActOnOpenMPLastprivateClause(
ArrayRef<Expr *> VarList, OpenMPLastprivateModifier LPKind,
SourceLocation LPKindLoc, SourceLocation ColonLoc, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation EndLoc) {
if (LPKind == OMPC_LASTPRIVATE_unknown && LPKindLoc.isValid()) {
assert(ColonLoc.isValid() && "Colon location must be valid.");
Diag(LPKindLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_lastprivate, /*First=*/0,
/*Last=*/OMPC_LASTPRIVATE_unknown)
<< getOpenMPClauseName(OMPC_lastprivate);
return nullptr;
}
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
SmallVector<Decl *, 4> ExprCaptures;
SmallVector<Expr *, 4> ExprPostUpdates;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP lastprivate clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
SrcExprs.push_back(nullptr);
DstExprs.push_back(nullptr);
AssignmentOps.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.14.3.5, Restrictions, C/C++, p.2]
// A variable that appears in a lastprivate clause must not have an
// incomplete type or a reference type.
if (RequireCompleteType(ELoc, Type,
diag::err_omp_lastprivate_incomplete_type))
continue;
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause.
//
// OpenMP 3.1 [2.9.3.5, lastprivate clause, Restrictions]
// A variable that appears in a lastprivate clause must not have a
// const-qualified type unless it is of class type with a mutable member.
if (rejectConstNotMutableType(*this, D, Type, OMPC_lastprivate, ELoc))
continue;
// OpenMP 5.0 [2.19.4.5 lastprivate Clause, Restrictions]
// A list item that appears in a lastprivate clause with the conditional
// modifier must be a scalar variable.
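// Illustrative example (not from the original source):
//   int a[10];
//   #pragma omp for lastprivate(conditional: a)  // rejected: not a scalar variable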
if (LPKind == OMPC_LASTPRIVATE_conditional && !Type->isScalarType()) {
Diag(ELoc, diag::err_omp_lastprivate_conditional_non_scalar);
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
// OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below.
// OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
// A list item may appear in a firstprivate or lastprivate clause but not
// both.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_lastprivate &&
(isOpenMPDistributeDirective(CurrDir) ||
DVar.CKind != OMPC_firstprivate) &&
(DVar.CKind != OMPC_private || DVar.RefExpr != nullptr)) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_lastprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
// OpenMP [2.14.3.5, Restrictions, p.2]
// A list item that is private within a parallel region, or that appears in
// the reduction clause of a parallel construct, must not appear in a
// lastprivate clause on a worksharing construct if any of the corresponding
// worksharing regions ever binds to any of the corresponding parallel
// regions.
DSAStackTy::DSAVarData TopDVar = DVar;
if (isOpenMPWorksharingDirective(CurrDir) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = DSAStack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_lastprivate)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
// OpenMP [2.14.3.5, Restrictions, C++, p.1,2]
// A variable of class type (or array thereof) that appears in a
// lastprivate clause requires an accessible, unambiguous default
// constructor for the class type, unless the list item is also specified
// in a firstprivate clause.
// A variable of class type (or array thereof) that appears in a
// lastprivate clause requires an accessible, unambiguous copy assignment
// operator for the class type.
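// Illustrative sketch (not from the original source): for 'lastprivate(s)' with
// 's' of class type S, the helpers built below correspond roughly to
//   S .lastprivate.src, .lastprivate.dst;
//   .lastprivate.dst = .lastprivate.src;  // exercises S's copy assignment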
Type = Context.getBaseElementType(Type).getNonReferenceType();
VarDecl *SrcVD = buildVarDecl(*this, ERange.getBegin(),
Type.getUnqualifiedType(), ".lastprivate.src",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr =
buildDeclRefExpr(*this, SrcVD, Type.getUnqualifiedType(), ELoc);
VarDecl *DstVD =
buildVarDecl(*this, ERange.getBegin(), Type, ".lastprivate.dst",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
// For arrays, generate an assignment operation for a single element and
// replace it with the original array element in CodeGen.
ExprResult AssignmentOp = BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign,
PseudoDstExpr, PseudoSrcExpr);
if (AssignmentOp.isInvalid())
continue;
AssignmentOp =
ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
continue;
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext()) {
if (TopDVar.CKind == OMPC_firstprivate) {
Ref = TopDVar.PrivateCopy;
} else {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
if (!isOpenMPCapturedDecl(D))
ExprCaptures.push_back(Ref->getDecl());
}
if (TopDVar.CKind == OMPC_firstprivate ||
(!isOpenMPCapturedDecl(D) &&
Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>())) {
ExprResult RefRes = DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
continue;
ExprResult PostUpdateRes =
BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
RefRes.get());
if (!PostUpdateRes.isUsable())
continue;
ExprPostUpdates.push_back(
IgnoredValueConversions(PostUpdateRes.get()).get());
}
}
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_lastprivate, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
SrcExprs.push_back(PseudoSrcExpr);
DstExprs.push_back(PseudoDstExpr);
AssignmentOps.push_back(AssignmentOp.get());
}
if (Vars.empty())
return nullptr;
return OMPLastprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, SrcExprs, DstExprs, AssignmentOps,
LPKind, LPKindLoc, ColonLoc,
buildPreInits(Context, ExprCaptures),
buildPostUpdate(*this, ExprPostUpdates));
}
OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP lastprivate clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
DeclRefExpr *Ref = nullptr;
if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_shared, Ref);
Vars.push_back((VD || !Ref || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
}
if (Vars.empty())
return nullptr;
return OMPSharedClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
}
namespace {
class DSARefChecker : public StmtVisitor<DSARefChecker, bool> {
DSAStackTy *Stack;
public:
bool VisitDeclRefExpr(DeclRefExpr *E) {
if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(VD, /*FromParent=*/false);
if (DVar.CKind == OMPC_shared && !DVar.RefExpr)
return false;
if (DVar.CKind != OMPC_unknown)
return true;
DSAStackTy::DSAVarData DVarPrivate = Stack->hasDSA(
VD, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; },
/*FromParent=*/true);
return DVarPrivate.CKind != OMPC_unknown;
}
return false;
}
bool VisitStmt(Stmt *S) {
for (Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
}
return false;
}
explicit DSARefChecker(DSAStackTy *S) : Stack(S) {}
};
} // namespace
namespace {
// Transform a MemberExpr for the specified FieldDecl of the current class into
// a DeclRefExpr to the specified OMPCapturedExprDecl.
class TransformExprToCaptures : public TreeTransform<TransformExprToCaptures> {
typedef TreeTransform<TransformExprToCaptures> BaseTransform;
ValueDecl *Field = nullptr;
DeclRefExpr *CapturedExpr = nullptr;
public:
TransformExprToCaptures(Sema &SemaRef, ValueDecl *FieldDecl)
: BaseTransform(SemaRef), Field(FieldDecl), CapturedExpr(nullptr) {}
ExprResult TransformMemberExpr(MemberExpr *E) {
if (isa<CXXThisExpr>(E->getBase()->IgnoreParenImpCasts()) &&
E->getMemberDecl() == Field) {
CapturedExpr = buildCapture(SemaRef, Field, E, /*WithInit=*/false);
return CapturedExpr;
}
return BaseTransform::TransformMemberExpr(E);
}
DeclRefExpr *getCapturedExpr() { return CapturedExpr; }
};
} // namespace
template <typename T, typename U>
static T filterLookupForUDReductionAndMapper(
SmallVectorImpl<U> &Lookups, const llvm::function_ref<T(ValueDecl *)> Gen) {
for (U &Set : Lookups) {
for (auto *D : Set) {
if (T Res = Gen(cast<ValueDecl>(D)))
return Res;
}
}
return T();
}
static NamedDecl *findAcceptableDecl(Sema &SemaRef, NamedDecl *D) {
assert(!LookupResult::isVisible(SemaRef, D) && "not in slow case");
for (auto RD : D->redecls()) {
// Don't bother with extra checks if we already know this one isn't visible.
if (RD == D)
continue;
auto ND = cast<NamedDecl>(RD);
if (LookupResult::isVisible(SemaRef, ND))
return ND;
}
return nullptr;
}
static void
argumentDependentLookup(Sema &SemaRef, const DeclarationNameInfo &Id,
SourceLocation Loc, QualType Ty,
SmallVectorImpl<UnresolvedSet<8>> &Lookups) {
// Find all of the associated namespaces and classes based on the
// arguments we have.
Sema::AssociatedNamespaceSet AssociatedNamespaces;
Sema::AssociatedClassSet AssociatedClasses;
OpaqueValueExpr OVE(Loc, Ty, VK_LValue);
SemaRef.FindAssociatedClassesAndNamespaces(Loc, &OVE, AssociatedNamespaces,
AssociatedClasses);
// C++ [basic.lookup.argdep]p3:
// Let X be the lookup set produced by unqualified lookup (3.4.1)
// and let Y be the lookup set produced by argument dependent
// lookup (defined as follows). If X contains [...] then Y is
// empty. Otherwise Y is the set of declarations found in the
// namespaces associated with the argument types as described
// below. The set of declarations found by the lookup of the name
// is the union of X and Y.
//
// Here, we compute Y and add its members to the overloaded
// candidate set.
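// Illustrative example (not from the original source): given
//   namespace N {
//     struct T { int v; };
//     #pragma omp declare reduction(merge : T : omp_out.v += omp_in.v)
//   }
// a clause 'reduction(merge : t)' on a variable 't' of type N::T can resolve
// 'merge' through this argument-dependent lookup even without qualification.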
for (auto *NS : AssociatedNamespaces) {
// When considering an associated namespace, the lookup is the
// same as the lookup performed when the associated namespace is
// used as a qualifier (3.4.3.2) except that:
//
// -- Any using-directives in the associated namespace are
// ignored.
//
// -- Any namespace-scope friend functions declared in
// associated classes are visible within their respective
// namespaces even if they are not visible during an ordinary
// lookup (11.4).
DeclContext::lookup_result R = NS->lookup(Id.getName());
for (auto *D : R) {
auto *Underlying = D;
if (auto *USD = dyn_cast<UsingShadowDecl>(D))
Underlying = USD->getTargetDecl();
if (!isa<OMPDeclareReductionDecl>(Underlying) &&
!isa<OMPDeclareMapperDecl>(Underlying))
continue;
if (!SemaRef.isVisible(D)) {
D = findAcceptableDecl(SemaRef, D);
if (!D)
continue;
if (auto *USD = dyn_cast<UsingShadowDecl>(D))
Underlying = USD->getTargetDecl();
}
Lookups.emplace_back();
Lookups.back().addDecl(Underlying);
}
}
}
static ExprResult
buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range,
Scope *S, CXXScopeSpec &ReductionIdScopeSpec,
const DeclarationNameInfo &ReductionId, QualType Ty,
CXXCastPath &BasePath, Expr *UnresolvedReduction) {
if (ReductionIdScopeSpec.isInvalid())
return ExprError();
SmallVector<UnresolvedSet<8>, 4> Lookups;
if (S) {
LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
Lookup.suppressDiagnostics();
while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
do {
S = S->getParent();
} while (S && !S->isDeclScope(D));
if (S)
S = S->getParent();
Lookups.emplace_back();
Lookups.back().append(Lookup.begin(), Lookup.end());
Lookup.clear();
}
} else if (auto *ULE =
cast_or_null<UnresolvedLookupExpr>(UnresolvedReduction)) {
Lookups.push_back(UnresolvedSet<8>());
Decl *PrevD = nullptr;
for (NamedDecl *D : ULE->decls()) {
if (D == PrevD)
Lookups.push_back(UnresolvedSet<8>());
else if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(D))
Lookups.back().addDecl(DRD);
PrevD = D;
}
}
if (SemaRef.CurContext->isDependentContext() || Ty->isDependentType() ||
Ty->isInstantiationDependentType() ||
Ty->containsUnexpandedParameterPack() ||
filterLookupForUDReductionAndMapper<bool>(Lookups, [](ValueDecl *D) {
return !D->isInvalidDecl() &&
(D->getType()->isDependentType() ||
D->getType()->isInstantiationDependentType() ||
D->getType()->containsUnexpandedParameterPack());
})) {
UnresolvedSet<8> ResSet;
for (const UnresolvedSet<8> &Set : Lookups) {
if (Set.empty())
continue;
ResSet.append(Set.begin(), Set.end());
// The last item marks the end of all declarations at the specified scope.
ResSet.addDecl(Set[Set.size() - 1]);
}
return UnresolvedLookupExpr::Create(
SemaRef.Context, /*NamingClass=*/nullptr,
ReductionIdScopeSpec.getWithLocInContext(SemaRef.Context), ReductionId,
/*ADL=*/true, /*Overloaded=*/true, ResSet.begin(), ResSet.end());
}
// Lookup inside the classes.
// C++ [over.match.oper]p3:
// For a unary operator @ with an operand of a type whose
// cv-unqualified version is T1, and for a binary operator @ with
// a left operand of a type whose cv-unqualified version is T1 and
// a right operand of a type whose cv-unqualified version is T2,
// three sets of candidate functions, designated member
// candidates, non-member candidates and built-in candidates, are
// constructed as follows:
// -- If T1 is a complete class type or a class currently being
// defined, the set of member candidates is the result of the
// qualified lookup of T1::operator@ (13.3.1.1.1); otherwise,
// the set of member candidates is empty.
LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
Lookup.suppressDiagnostics();
if (const auto *TyRec = Ty->getAs<RecordType>()) {
// Complete the type if it can be completed.
// If the type is neither complete nor being defined, bail out now.
if (SemaRef.isCompleteType(Loc, Ty) || TyRec->isBeingDefined() ||
TyRec->getDecl()->getDefinition()) {
Lookup.clear();
SemaRef.LookupQualifiedName(Lookup, TyRec->getDecl());
if (Lookup.empty()) {
Lookups.emplace_back();
Lookups.back().append(Lookup.begin(), Lookup.end());
}
}
}
// Perform ADL.
if (SemaRef.getLangOpts().CPlusPlus)
argumentDependentLookup(SemaRef, ReductionId, Loc, Ty, Lookups);
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Ty](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.Context.hasSameType(D->getType(), Ty))
return D;
return nullptr;
}))
return SemaRef.BuildDeclRefExpr(VD, VD->getType().getNonReferenceType(),
VK_LValue, Loc);
if (SemaRef.getLangOpts().CPlusPlus) {
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Ty, Loc](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.IsDerivedFrom(Loc, Ty, D->getType()) &&
!Ty.isMoreQualifiedThan(D->getType()))
return D;
return nullptr;
})) {
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/false);
if (SemaRef.IsDerivedFrom(Loc, Ty, VD->getType(), Paths)) {
if (!Paths.isAmbiguous(SemaRef.Context.getCanonicalType(
VD->getType().getUnqualifiedType()))) {
if (SemaRef.CheckBaseClassAccess(
Loc, VD->getType(), Ty, Paths.front(),
/*DiagID=*/0) != Sema::AR_inaccessible) {
SemaRef.BuildBasePathArray(Paths, BasePath);
return SemaRef.BuildDeclRefExpr(
VD, VD->getType().getNonReferenceType(), VK_LValue, Loc);
}
}
}
}
}
if (ReductionIdScopeSpec.isSet()) {
SemaRef.Diag(Loc, diag::err_omp_not_resolved_reduction_identifier)
<< Ty << Range;
return ExprError();
}
return ExprEmpty();
}
namespace {
/// Data for the reduction-based clauses.
struct ReductionData {
/// List of original reduction items.
SmallVector<Expr *, 8> Vars;
/// List of private copies of the reduction items.
SmallVector<Expr *, 8> Privates;
/// LHS expressions for the reduction_op expressions.
SmallVector<Expr *, 8> LHSs;
/// RHS expressions for the reduction_op expressions.
SmallVector<Expr *, 8> RHSs;
/// Reduction operation expression.
SmallVector<Expr *, 8> ReductionOps;
/// inscan copy operation expressions.
SmallVector<Expr *, 8> InscanCopyOps;
/// inscan copy temp array expressions for prefix sums.
SmallVector<Expr *, 8> InscanCopyArrayTemps;
/// inscan copy temp array element expressions for prefix sums.
SmallVector<Expr *, 8> InscanCopyArrayElems;
/// Taskgroup descriptors for the corresponding reduction items in
/// in_reduction clauses.
SmallVector<Expr *, 8> TaskgroupDescriptors;
/// List of captures for clause.
SmallVector<Decl *, 4> ExprCaptures;
/// List of postupdate expressions.
SmallVector<Expr *, 4> ExprPostUpdates;
/// Reduction modifier.
unsigned RedModifier = 0;
ReductionData() = delete;
/// Reserves required memory for the reduction data.
ReductionData(unsigned Size, unsigned Modifier = 0) : RedModifier(Modifier) {
Vars.reserve(Size);
Privates.reserve(Size);
LHSs.reserve(Size);
RHSs.reserve(Size);
ReductionOps.reserve(Size);
if (RedModifier == OMPC_REDUCTION_inscan) {
InscanCopyOps.reserve(Size);
InscanCopyArrayTemps.reserve(Size);
InscanCopyArrayElems.reserve(Size);
}
TaskgroupDescriptors.reserve(Size);
ExprCaptures.reserve(Size);
ExprPostUpdates.reserve(Size);
}
/// Stores reduction item and reduction operation only (required for dependent
/// reduction item).
void push(Expr *Item, Expr *ReductionOp) {
Vars.emplace_back(Item);
Privates.emplace_back(nullptr);
LHSs.emplace_back(nullptr);
RHSs.emplace_back(nullptr);
ReductionOps.emplace_back(ReductionOp);
TaskgroupDescriptors.emplace_back(nullptr);
if (RedModifier == OMPC_REDUCTION_inscan) {
InscanCopyOps.push_back(nullptr);
InscanCopyArrayTemps.push_back(nullptr);
InscanCopyArrayElems.push_back(nullptr);
}
}
/// Stores reduction data.
void push(Expr *Item, Expr *Private, Expr *LHS, Expr *RHS, Expr *ReductionOp,
Expr *TaskgroupDescriptor, Expr *CopyOp, Expr *CopyArrayTemp,
Expr *CopyArrayElem) {
Vars.emplace_back(Item);
Privates.emplace_back(Private);
LHSs.emplace_back(LHS);
RHSs.emplace_back(RHS);
ReductionOps.emplace_back(ReductionOp);
TaskgroupDescriptors.emplace_back(TaskgroupDescriptor);
if (RedModifier == OMPC_REDUCTION_inscan) {
InscanCopyOps.push_back(CopyOp);
InscanCopyArrayTemps.push_back(CopyArrayTemp);
InscanCopyArrayElems.push_back(CopyArrayElem);
} else {
assert(CopyOp == nullptr && CopyArrayTemp == nullptr &&
CopyArrayElem == nullptr &&
"Copy operation must be used for inscan reductions only.");
}
}
};
} // namespace
static bool checkOMPArraySectionConstantForReduction(
ASTContext &Context, const OMPArraySectionExpr *OASE, bool &SingleElement,
SmallVectorImpl<llvm::APSInt> &ArraySizes) {
const Expr *Length = OASE->getLength();
if (Length == nullptr) {
// For array sections of the form [1:] or [:], we would need to analyze
// the lower bound...
if (OASE->getColonLocFirst().isValid())
return false;
// This is an array subscript which has implicit length 1!
SingleElement = true;
ArraySizes.push_back(llvm::APSInt::get(1));
} else {
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, Context))
return false;
llvm::APSInt ConstantLengthValue = Result.Val.getInt();
SingleElement = (ConstantLengthValue.getSExtValue() == 1);
ArraySizes.push_back(ConstantLengthValue);
}
// Get the base of this array section and walk up from there.
const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
// We require length = 1 for all array sections except the right-most to
// guarantee that the memory region is contiguous and has no holes in it.
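// Illustrative example (not from the original source): for 'int a[4][8]', the
// section a[1:1][0:8] passes this check, while a[0:2][0:8] does not, because an
// outer section with length != 1 is conservatively rejected here.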
while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) {
Length = TempOASE->getLength();
if (Length == nullptr) {
// For array sections of the form [1:] or [:], we would need to analyze
// the lower bound...
if (OASE->getColonLocFirst().isValid())
return false;
// This is an array subscript which has implicit length 1!
ArraySizes.push_back(llvm::APSInt::get(1));
} else {
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, Context))
return false;
llvm::APSInt ConstantLengthValue = Result.Val.getInt();
if (ConstantLengthValue.getSExtValue() != 1)
return false;
ArraySizes.push_back(ConstantLengthValue);
}
Base = TempOASE->getBase()->IgnoreParenImpCasts();
}
// If we have a single element, we don't need to add the implicit lengths.
if (!SingleElement) {
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) {
// Has implicit length 1!
ArraySizes.push_back(llvm::APSInt::get(1));
Base = TempASE->getBase()->IgnoreParenImpCasts();
}
}
// This array section can be privatized as a single value or as a constant
// sized array.
return true;
}
static bool actOnOMPReductionKindClause(
Sema &S, DSAStackTy *Stack, OpenMPClauseKind ClauseKind,
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions, ReductionData &RD) {
DeclarationName DN = ReductionId.getName();
OverloadedOperatorKind OOK = DN.getCXXOverloadedOperator();
BinaryOperatorKind BOK = BO_Comma;
ASTContext &Context = S.Context;
// OpenMP [2.14.3.6, reduction clause]
// C
// reduction-identifier is either an identifier or one of the following
// operators: +, -, *, &, |, ^, && and ||
// C++
// reduction-identifier is either an id-expression or one of the following
// operators: +, -, *, &, |, ^, && and ||
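// Illustrative examples (not from the original source):
//   #pragma omp parallel for reduction(+: sum)    // OO_Plus -> BO_Add
//   #pragma omp parallel for reduction(min: best) // "min"   -> BO_LT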
switch (OOK) {
case OO_Plus:
case OO_Minus:
BOK = BO_Add;
break;
case OO_Star:
BOK = BO_Mul;
break;
case OO_Amp:
BOK = BO_And;
break;
case OO_Pipe:
BOK = BO_Or;
break;
case OO_Caret:
BOK = BO_Xor;
break;
case OO_AmpAmp:
BOK = BO_LAnd;
break;
case OO_PipePipe:
BOK = BO_LOr;
break;
case OO_New:
case OO_Delete:
case OO_Array_New:
case OO_Array_Delete:
case OO_Slash:
case OO_Percent:
case OO_Tilde:
case OO_Exclaim:
case OO_Equal:
case OO_Less:
case OO_Greater:
case OO_LessEqual:
case OO_GreaterEqual:
case OO_PlusEqual:
case OO_MinusEqual:
case OO_StarEqual:
case OO_SlashEqual:
case OO_PercentEqual:
case OO_CaretEqual:
case OO_AmpEqual:
case OO_PipeEqual:
case OO_LessLess:
case OO_GreaterGreater:
case OO_LessLessEqual:
case OO_GreaterGreaterEqual:
case OO_EqualEqual:
case OO_ExclaimEqual:
case OO_Spaceship:
case OO_PlusPlus:
case OO_MinusMinus:
case OO_Comma:
case OO_ArrowStar:
case OO_Arrow:
case OO_Call:
case OO_Subscript:
case OO_Conditional:
case OO_Coawait:
case NUM_OVERLOADED_OPERATORS:
llvm_unreachable("Unexpected reduction identifier");
case OO_None:
if (IdentifierInfo *II = DN.getAsIdentifierInfo()) {
if (II->isStr("max"))
BOK = BO_GT;
else if (II->isStr("min"))
BOK = BO_LT;
}
break;
}
SourceRange ReductionIdRange;
if (ReductionIdScopeSpec.isValid())
ReductionIdRange.setBegin(ReductionIdScopeSpec.getBeginLoc());
else
ReductionIdRange.setBegin(ReductionId.getBeginLoc());
ReductionIdRange.setEnd(ReductionId.getEndLoc());
auto IR = UnresolvedReductions.begin(), ER = UnresolvedReductions.end();
bool FirstIter = true;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "nullptr expr in OpenMP reduction clause.");
// OpenMP [2.1, C/C++]
// A list item is a variable or array section, subject to the restrictions
// specified in Section 2.4 on page 42 and in each of the sections
// describing clauses and directives for which a list appears.
// OpenMP [2.14.3.3, Restrictions, p.1]
// A variable that is part of another variable (as an array or
// structure element) cannot appear in a private clause.
if (!FirstIter && IR != ER)
++IR;
FirstIter = false;
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
if (Res.second) {
// Try to find a corresponding 'declare reduction' construct before using
// builtin/overloaded operators.
QualType Type = Context.DependentTy;
CXXCastPath BasePath;
ExprResult DeclareReductionRef = buildDeclareReductionRef(
S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec,
ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
Expr *ReductionOp = nullptr;
if (S.CurContext->isDependentContext() &&
(DeclareReductionRef.isUnset() ||
isa<UnresolvedLookupExpr>(DeclareReductionRef.get())))
ReductionOp = DeclareReductionRef.get();
// It will be analyzed later.
RD.push(RefExpr, ReductionOp);
}
ValueDecl *D = Res.first;
if (!D)
continue;
Expr *TaskgroupDescriptor = nullptr;
QualType Type;
auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr->IgnoreParens());
auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr->IgnoreParens());
if (ASE) {
Type = ASE->getType().getNonReferenceType();
} else if (OASE) {
QualType BaseType =
OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
Type = ATy->getElementType();
else
Type = BaseType->getPointeeType();
Type = Type.getNonReferenceType();
} else {
Type = Context.getBaseElementType(D->getType().getNonReferenceType());
}
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
if (S.RequireCompleteType(ELoc, D->getType(),
diag::err_omp_reduction_incomplete_type))
continue;
// OpenMP [2.14.3.6, reduction clause, Restrictions]
// A list item that appears in a reduction clause must not be
// const-qualified.
if (rejectConstNotMutableType(S, D, Type, ClauseKind, ELoc,
/*AcceptIfMutable*/ false, ASE || OASE))
continue;
OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective();
// OpenMP [2.9.3.6, Restrictions, C/C++, p.4]
// If a list-item is a reference type then it must bind to the same object
// for all threads of the team.
if (!ASE && !OASE) {
if (VD) {
VarDecl *VDDef = VD->getDefinition();
if (VD->getType()->isReferenceType() && VDDef && VDDef->hasInit()) {
DSARefChecker Check(Stack);
if (Check.Visit(VDDef->getInit())) {
S.Diag(ELoc, diag::err_omp_reduction_ref_type_arg)
<< getOpenMPClauseName(ClauseKind) << ERange;
S.Diag(VDDef->getLocation(), diag::note_defined_here) << VDDef;
continue;
}
}
}
// OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
// OpenMP [2.14.3.6, Restrictions, p.3]
// Any number of reduction clauses can be specified on the directive,
// but a list item can appear only once in the reduction clauses for that
// directive.
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind == OMPC_reduction) {
S.Diag(ELoc, diag::err_omp_once_referenced)
<< getOpenMPClauseName(ClauseKind);
if (DVar.RefExpr)
S.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_referenced);
continue;
}
if (DVar.CKind != OMPC_unknown) {
S.Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_reduction);
reportOriginalDsa(S, Stack, D, DVar);
continue;
}
// OpenMP [2.14.3.6, Restrictions, p.1]
// A list item that appears in a reduction clause of a worksharing
// construct must be shared in the parallel regions to which any of the
// worksharing regions arising from the worksharing construct bind.
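// Illustrative example (not from the original source): inside
//   #pragma omp parallel private(x)
// a nested
//   #pragma omp for reduction(+: x)
// is rejected here because x is not shared in the binding parallel region.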
if (isOpenMPWorksharingDirective(CurrDir) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = Stack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared) {
S.Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_reduction)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(S, Stack, D, DVar);
continue;
}
}
}
// Try to find a corresponding 'declare reduction' construct before using
// builtin/overloaded operators.
CXXCastPath BasePath;
ExprResult DeclareReductionRef = buildDeclareReductionRef(
S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec,
ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
if (DeclareReductionRef.isInvalid())
continue;
if (S.CurContext->isDependentContext() &&
(DeclareReductionRef.isUnset() ||
isa<UnresolvedLookupExpr>(DeclareReductionRef.get()))) {
RD.push(RefExpr, DeclareReductionRef.get());
continue;
}
if (BOK == BO_Comma && DeclareReductionRef.isUnset()) {
// A reduction identifier that is not allowed was found.
S.Diag(ReductionId.getBeginLoc(),
diag::err_omp_unknown_reduction_identifier)
<< Type << ReductionIdRange;
continue;
}
// OpenMP [2.14.3.6, reduction clause, Restrictions]
// The type of a list item that appears in a reduction clause must be valid
// for the reduction-identifier. For a max or min reduction in C, the type
// of the list item must be an allowed arithmetic data type: char, int,
// float, double, or _Bool, possibly modified with long, short, signed, or
// unsigned. For a max or min reduction in C++, the type of the list item
// must be an allowed arithmetic data type: char, wchar_t, int, float,
// double, or bool, possibly modified with long, short, signed, or unsigned.
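// For example (illustrative), with no matching 'declare reduction' found
// above, 'reduction(min : s)' where 's' has a class type is rejected here,
// while 'reduction(min : x)' with 'int x' passes this check.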
if (DeclareReductionRef.isUnset()) {
if ((BOK == BO_GT || BOK == BO_LT) &&
!(Type->isScalarType() ||
(S.getLangOpts().CPlusPlus && Type->isArithmeticType()))) {
S.Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg)
<< getOpenMPClauseName(ClauseKind) << S.getLangOpts().CPlusPlus;
if (!ASE && !OASE) {
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
S.Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
}
continue;
}
if ((BOK == BO_OrAssign || BOK == BO_AndAssign || BOK == BO_XorAssign) &&
!S.getLangOpts().CPlusPlus && Type->isFloatingType()) {
S.Diag(ELoc, diag::err_omp_clause_floating_type_arg)
<< getOpenMPClauseName(ClauseKind);
if (!ASE && !OASE) {
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
S.Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
}
continue;
}
}
Type = Type.getNonLValueExprType(Context).getUnqualifiedType();
VarDecl *LHSVD = buildVarDecl(S, ELoc, Type, ".reduction.lhs",
D->hasAttrs() ? &D->getAttrs() : nullptr);
VarDecl *RHSVD = buildVarDecl(S, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
QualType PrivateTy = Type;
// Check whether we can determine constant lengths for all array sections and
// avoid the VLA.
bool ConstantLengthOASE = false;
if (OASE) {
bool SingleElement;
llvm::SmallVector<llvm::APSInt, 4> ArraySizes;
ConstantLengthOASE = checkOMPArraySectionConstantForReduction(
Context, OASE, SingleElement, ArraySizes);
// If we don't have a single element, we must emit a constant array type.
if (ConstantLengthOASE && !SingleElement) {
for (llvm::APSInt &Size : ArraySizes)
PrivateTy = Context.getConstantArrayType(PrivateTy, Size, nullptr,
ArrayType::Normal,
/*IndexTypeQuals=*/0);
}
}
if ((OASE && !ConstantLengthOASE) ||
(!OASE && !ASE &&
D->getType().getNonReferenceType()->isVariablyModifiedType())) {
if (!Context.getTargetInfo().isVLASupported()) {
if (isOpenMPTargetExecutionDirective(Stack->getCurrentDirective())) {
S.Diag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
S.Diag(ELoc, diag::note_vla_unsupported);
continue;
} else {
S.targetDiag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
S.targetDiag(ELoc, diag::note_vla_unsupported);
}
}
// For arrays/array sections only:
// Create a pseudo array type for the private copy. The size for this array
// will be generated during codegen.
// For array subscripts or single variables PrivateTy is the same as Type
// (the type of the variable or of a single array element).
PrivateTy = Context.getVariableArrayType(
Type,
new (Context) OpaqueValueExpr(ELoc, Context.getSizeType(), VK_RValue),
ArrayType::Normal, /*IndexTypeQuals=*/0, SourceRange());
} else if (!ASE && !OASE &&
Context.getAsArrayType(D->getType().getNonReferenceType())) {
PrivateTy = D->getType().getNonReferenceType();
}
// Private copy.
VarDecl *PrivateVD =
buildVarDecl(S, ELoc, PrivateTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
// Add initializer for private variable.
Expr *Init = nullptr;
DeclRefExpr *LHSDRE = buildDeclRefExpr(S, LHSVD, Type, ELoc);
DeclRefExpr *RHSDRE = buildDeclRefExpr(S, RHSVD, Type, ELoc);
if (DeclareReductionRef.isUsable()) {
auto *DRDRef = DeclareReductionRef.getAs<DeclRefExpr>();
auto *DRD = cast<OMPDeclareReductionDecl>(DRDRef->getDecl());
if (DRD->getInitializer()) {
S.ActOnUninitializedDecl(PrivateVD);
Init = DRDRef;
RHSVD->setInit(DRDRef);
RHSVD->setInitStyle(VarDecl::CallInit);
}
} else {
switch (BOK) {
case BO_Add:
case BO_Xor:
case BO_Or:
case BO_LOr:
// '+', '-', '^', '|', '||' reduction ops - initializer is '0'.
if (Type->isScalarType() || Type->isAnyComplexType())
Init = S.ActOnIntegerConstant(ELoc, /*Val=*/0).get();
break;
case BO_Mul:
case BO_LAnd:
if (Type->isScalarType() || Type->isAnyComplexType()) {
// '*' and '&&' reduction ops - initializer is '1'.
Init = S.ActOnIntegerConstant(ELoc, /*Val=*/1).get();
}
break;
case BO_And: {
// '&' reduction op - initializer is '~0'.
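// For example (illustrative), for 'reduction(& : mask)' with a 32-bit
// unsigned 'mask', the private copy starts as 0xFFFFFFFF, the identity of '&'.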
QualType OrigType = Type;
if (auto *ComplexTy = OrigType->getAs<ComplexType>())
Type = ComplexTy->getElementType();
if (Type->isRealFloatingType()) {
llvm::APFloat InitValue = llvm::APFloat::getAllOnesValue(
Context.getFloatTypeSemantics(Type),
Context.getTypeSize(Type));
Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
Type, ELoc);
} else if (Type->isScalarType()) {
uint64_t Size = Context.getTypeSize(Type);
QualType IntTy = Context.getIntTypeForBitwidth(Size, /*Signed=*/0);
llvm::APInt InitValue = llvm::APInt::getAllOnesValue(Size);
Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
}
if (Init && OrigType->isAnyComplexType()) {
// Init = 0xFFFF + 0xFFFFi;
auto *Im = new (Context) ImaginaryLiteral(Init, OrigType);
Init = S.CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get();
}
Type = OrigType;
break;
}
case BO_LT:
case BO_GT: {
// 'min' reduction op - initializer is 'Largest representable number in
// the reduction list item type'.
// 'max' reduction op - initializer is 'Least representable number in
// the reduction list item type'.
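// For example (illustrative), 'reduction(max : x)' with 'int x' initializes
// the private copy to INT_MIN, and 'reduction(min : x)' initializes it to
// INT_MAX.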
if (Type->isIntegerType() || Type->isPointerType()) {
bool IsSigned = Type->hasSignedIntegerRepresentation();
uint64_t Size = Context.getTypeSize(Type);
QualType IntTy =
Context.getIntTypeForBitwidth(Size, /*Signed=*/IsSigned);
llvm::APInt InitValue =
(BOK != BO_LT) ? IsSigned ? llvm::APInt::getSignedMinValue(Size)
: llvm::APInt::getMinValue(Size)
: IsSigned ? llvm::APInt::getSignedMaxValue(Size)
: llvm::APInt::getMaxValue(Size);
Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
if (Type->isPointerType()) {
// Cast to pointer type.
ExprResult CastExpr = S.BuildCStyleCastExpr(
ELoc, Context.getTrivialTypeSourceInfo(Type, ELoc), ELoc, Init);
if (CastExpr.isInvalid())
continue;
Init = CastExpr.get();
}
} else if (Type->isRealFloatingType()) {
llvm::APFloat InitValue = llvm::APFloat::getLargest(
Context.getFloatTypeSemantics(Type), BOK != BO_LT);
Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
Type, ELoc);
}
break;
}
case BO_PtrMemD:
case BO_PtrMemI:
case BO_MulAssign:
case BO_Div:
case BO_Rem:
case BO_Sub:
case BO_Shl:
case BO_Shr:
case BO_LE:
case BO_GE:
case BO_EQ:
case BO_NE:
case BO_Cmp:
case BO_AndAssign:
case BO_XorAssign:
case BO_OrAssign:
case BO_Assign:
case BO_AddAssign:
case BO_SubAssign:
case BO_DivAssign:
case BO_RemAssign:
case BO_ShlAssign:
case BO_ShrAssign:
case BO_Comma:
llvm_unreachable("Unexpected reduction operation");
}
}
if (Init && DeclareReductionRef.isUnset()) {
S.AddInitializerToDecl(RHSVD, Init, /*DirectInit=*/false);
// Store initializer for single element in private copy. Will be used
// during codegen.
PrivateVD->setInit(RHSVD->getInit());
PrivateVD->setInitStyle(RHSVD->getInitStyle());
} else if (!Init) {
S.ActOnUninitializedDecl(RHSVD);
// Store initializer for single element in private copy. Will be used
// during codegen.
PrivateVD->setInit(RHSVD->getInit());
PrivateVD->setInitStyle(RHSVD->getInitStyle());
}
if (RHSVD->isInvalidDecl())
continue;
if (!RHSVD->hasInit() &&
(DeclareReductionRef.isUnset() || !S.LangOpts.CPlusPlus)) {
S.Diag(ELoc, diag::err_omp_reduction_id_not_compatible)
<< Type << ReductionIdRange;
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
VarDecl::DeclarationOnly;
S.Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
DeclRefExpr *PrivateDRE = buildDeclRefExpr(S, PrivateVD, PrivateTy, ELoc);
ExprResult ReductionOp;
if (DeclareReductionRef.isUsable()) {
QualType RedTy = DeclareReductionRef.get()->getType();
QualType PtrRedTy = Context.getPointerType(RedTy);
ExprResult LHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, LHSDRE);
ExprResult RHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, RHSDRE);
if (!BasePath.empty()) {
LHS = S.DefaultLvalueConversion(LHS.get());
RHS = S.DefaultLvalueConversion(RHS.get());
LHS = ImplicitCastExpr::Create(Context, PtrRedTy,
CK_UncheckedDerivedToBase, LHS.get(),
&BasePath, LHS.get()->getValueKind());
RHS = ImplicitCastExpr::Create(Context, PtrRedTy,
CK_UncheckedDerivedToBase, RHS.get(),
&BasePath, RHS.get()->getValueKind());
}
FunctionProtoType::ExtProtoInfo EPI;
QualType Params[] = {PtrRedTy, PtrRedTy};
QualType FnTy = Context.getFunctionType(Context.VoidTy, Params, EPI);
auto *OVE = new (Context) OpaqueValueExpr(
ELoc, Context.getPointerType(FnTy), VK_RValue, OK_Ordinary,
S.DefaultLvalueConversion(DeclareReductionRef.get()).get());
Expr *Args[] = {LHS.get(), RHS.get()};
ReductionOp =
CallExpr::Create(Context, OVE, Args, Context.VoidTy, VK_RValue, ELoc);
} else {
ReductionOp = S.BuildBinOp(
Stack->getCurScope(), ReductionId.getBeginLoc(), BOK, LHSDRE, RHSDRE);
if (ReductionOp.isUsable()) {
if (BOK != BO_LT && BOK != BO_GT) {
ReductionOp =
S.BuildBinOp(Stack->getCurScope(), ReductionId.getBeginLoc(),
BO_Assign, LHSDRE, ReductionOp.get());
} else {
auto *ConditionalOp = new (Context)
ConditionalOperator(ReductionOp.get(), ELoc, LHSDRE, ELoc, RHSDRE,
Type, VK_LValue, OK_Ordinary);
ReductionOp =
S.BuildBinOp(Stack->getCurScope(), ReductionId.getBeginLoc(),
BO_Assign, LHSDRE, ConditionalOp);
}
if (ReductionOp.isUsable())
ReductionOp = S.ActOnFinishFullExpr(ReductionOp.get(),
/*DiscardedValue*/ false);
}
if (!ReductionOp.isUsable())
continue;
}
// Add copy operations for inscan reductions.
// LHS = RHS;
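// For example (illustrative), an inscan reduction comes from a scan loop like
//   #pragma omp for reduction(inscan, + : sum)
//   for (int i = 0; i < n; ++i) {
//     sum += a[i];
//     #pragma omp scan inclusive(sum)
//     b[i] = sum;
//   }
// The 'LHS = RHS' copy built here is used by the prefix-sum lowering in
// CodeGen.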
ExprResult CopyOpRes, TempArrayRes, TempArrayElem;
if (ClauseKind == OMPC_reduction &&
RD.RedModifier == OMPC_REDUCTION_inscan) {
ExprResult RHS = S.DefaultLvalueConversion(RHSDRE);
CopyOpRes = S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, LHSDRE,
RHS.get());
if (!CopyOpRes.isUsable())
continue;
CopyOpRes =
S.ActOnFinishFullExpr(CopyOpRes.get(), /*DiscardedValue=*/true);
if (!CopyOpRes.isUsable())
continue;
// For the simd directive and simd-based directives in simd mode there is no
// need to construct a temp array; a single temp element is enough.
if (Stack->getCurrentDirective() == OMPD_simd ||
(S.getLangOpts().OpenMPSimd &&
isOpenMPSimdDirective(Stack->getCurrentDirective()))) {
VarDecl *TempArrayVD =
buildVarDecl(S, ELoc, PrivateTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
// Add a constructor to the temp decl.
S.ActOnUninitializedDecl(TempArrayVD);
TempArrayRes = buildDeclRefExpr(S, TempArrayVD, PrivateTy, ELoc);
} else {
// Build temp array for prefix sum.
auto *Dim = new (S.Context)
OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
QualType ArrayTy =
S.Context.getVariableArrayType(PrivateTy, Dim, ArrayType::Normal,
/*IndexTypeQuals=*/0, {ELoc, ELoc});
VarDecl *TempArrayVD =
buildVarDecl(S, ELoc, ArrayTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
// Add a constructor to the temp decl.
S.ActOnUninitializedDecl(TempArrayVD);
TempArrayRes = buildDeclRefExpr(S, TempArrayVD, ArrayTy, ELoc);
TempArrayElem =
S.DefaultFunctionArrayLvalueConversion(TempArrayRes.get());
auto *Idx = new (S.Context)
OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
TempArrayElem = S.CreateBuiltinArraySubscriptExpr(TempArrayElem.get(),
ELoc, Idx, ELoc);
}
}
// OpenMP [2.15.4.6, Restrictions, p.2]
// A list item that appears in an in_reduction clause of a task construct
// must appear in a task_reduction clause of a construct associated with a
// taskgroup region that includes the participating task in its taskgroup
// set. The construct associated with the innermost region that meets this
// condition must specify the same reduction-identifier as the in_reduction
// clause.
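// For example (illustrative), a task with 'in_reduction(+ : x)' must be
// enclosed in a taskgroup region with 'task_reduction(+ : x)' that uses the
// same '+' identifier; mismatched identifiers are diagnosed below.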
if (ClauseKind == OMPC_in_reduction) {
SourceRange ParentSR;
BinaryOperatorKind ParentBOK;
const Expr *ParentReductionOp = nullptr;
Expr *ParentBOKTD = nullptr, *ParentReductionOpTD = nullptr;
DSAStackTy::DSAVarData ParentBOKDSA =
Stack->getTopMostTaskgroupReductionData(D, ParentSR, ParentBOK,
ParentBOKTD);
DSAStackTy::DSAVarData ParentReductionOpDSA =
Stack->getTopMostTaskgroupReductionData(
D, ParentSR, ParentReductionOp, ParentReductionOpTD);
bool IsParentBOK = ParentBOKDSA.DKind != OMPD_unknown;
bool IsParentReductionOp = ParentReductionOpDSA.DKind != OMPD_unknown;
if ((DeclareReductionRef.isUnset() && IsParentReductionOp) ||
(DeclareReductionRef.isUsable() && IsParentBOK) ||
(IsParentBOK && BOK != ParentBOK) || IsParentReductionOp) {
bool EmitError = true;
if (IsParentReductionOp && DeclareReductionRef.isUsable()) {
llvm::FoldingSetNodeID RedId, ParentRedId;
ParentReductionOp->Profile(ParentRedId, Context, /*Canonical=*/true);
DeclareReductionRef.get()->Profile(RedId, Context,
/*Canonical=*/true);
EmitError = RedId != ParentRedId;
}
if (EmitError) {
S.Diag(ReductionId.getBeginLoc(),
diag::err_omp_reduction_identifier_mismatch)
<< ReductionIdRange << RefExpr->getSourceRange();
S.Diag(ParentSR.getBegin(),
diag::note_omp_previous_reduction_identifier)
<< ParentSR
<< (IsParentBOK ? ParentBOKDSA.RefExpr
: ParentReductionOpDSA.RefExpr)
->getSourceRange();
continue;
}
}
TaskgroupDescriptor = IsParentBOK ? ParentBOKTD : ParentReductionOpTD;
}
DeclRefExpr *Ref = nullptr;
Expr *VarsExpr = RefExpr->IgnoreParens();
if (!VD && !S.CurContext->isDependentContext()) {
if (ASE || OASE) {
TransformExprToCaptures RebuildToCapture(S, D);
VarsExpr =
RebuildToCapture.TransformExpr(RefExpr->IgnoreParens()).get();
Ref = RebuildToCapture.getCapturedExpr();
} else {
VarsExpr = Ref = buildCapture(S, D, SimpleRefExpr, /*WithInit=*/false);
}
if (!S.isOpenMPCapturedDecl(D)) {
RD.ExprCaptures.emplace_back(Ref->getDecl());
if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
ExprResult RefRes = S.DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
continue;
ExprResult PostUpdateRes =
S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
RefRes.get());
if (!PostUpdateRes.isUsable())
continue;
if (isOpenMPTaskingDirective(Stack->getCurrentDirective()) ||
Stack->getCurrentDirective() == OMPD_taskgroup) {
S.Diag(RefExpr->getExprLoc(),
diag::err_omp_reduction_non_addressable_expression)
<< RefExpr->getSourceRange();
continue;
}
RD.ExprPostUpdates.emplace_back(
S.IgnoredValueConversions(PostUpdateRes.get()).get());
}
}
}
// All reduction items are still marked as reduction (so as not to increase
// the code base size).
unsigned Modifier = RD.RedModifier;
// Consider task_reductions as reductions with task modifier. Required for
// correct analysis of in_reduction clauses.
if (CurrDir == OMPD_taskgroup && ClauseKind == OMPC_task_reduction)
Modifier = OMPC_REDUCTION_task;
Stack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref, Modifier);
if (Modifier == OMPC_REDUCTION_task &&
(CurrDir == OMPD_taskgroup ||
((isOpenMPParallelDirective(CurrDir) ||
isOpenMPWorksharingDirective(CurrDir)) &&
!isOpenMPSimdDirective(CurrDir)))) {
if (DeclareReductionRef.isUsable())
Stack->addTaskgroupReductionData(D, ReductionIdRange,
DeclareReductionRef.get());
else
Stack->addTaskgroupReductionData(D, ReductionIdRange, BOK);
}
RD.push(VarsExpr, PrivateDRE, LHSDRE, RHSDRE, ReductionOp.get(),
TaskgroupDescriptor, CopyOpRes.get(), TempArrayRes.get(),
TempArrayElem.get());
}
return RD.Vars.empty();
}
OMPClause *Sema::ActOnOpenMPReductionClause(
ArrayRef<Expr *> VarList, OpenMPReductionClauseModifier Modifier,
SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
if (ModifierLoc.isValid() && Modifier == OMPC_REDUCTION_unknown) {
Diag(LParenLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_reduction, /*First=*/0,
/*Last=*/OMPC_REDUCTION_unknown)
<< getOpenMPClauseName(OMPC_reduction);
return nullptr;
}
// OpenMP 5.0, 2.19.5.4 reduction Clause, Restrictions
// A reduction clause with the inscan reduction-modifier may only appear on a
// worksharing-loop construct, a worksharing-loop SIMD construct, a simd
// construct, a parallel worksharing-loop construct or a parallel
// worksharing-loop SIMD construct.
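// For example (illustrative), 'reduction(inscan, + : x)' is accepted on
// '#pragma omp for' but rejected on directives outside this list, such as
// '#pragma omp teams distribute'.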
if (Modifier == OMPC_REDUCTION_inscan &&
(DSAStack->getCurrentDirective() != OMPD_for &&
DSAStack->getCurrentDirective() != OMPD_for_simd &&
DSAStack->getCurrentDirective() != OMPD_simd &&
DSAStack->getCurrentDirective() != OMPD_parallel_for &&
DSAStack->getCurrentDirective() != OMPD_parallel_for_simd)) {
Diag(ModifierLoc, diag::err_omp_wrong_inscan_reduction);
return nullptr;
}
ReductionData RD(VarList.size(), Modifier);
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPReductionClause::Create(
Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier,
RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.InscanCopyOps,
RD.InscanCopyArrayTemps, RD.InscanCopyArrayElems,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
}
OMPClause *Sema::ActOnOpenMPTaskReductionClause(
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
ReductionData RD(VarList.size());
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_task_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPTaskReductionClause::Create(
Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
}
OMPClause *Sema::ActOnOpenMPInReductionClause(
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
ReductionData RD(VarList.size());
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_in_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPInReductionClause::Create(
Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.TaskgroupDescriptors,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
}
bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
SourceLocation LinLoc) {
if ((!LangOpts.CPlusPlus && LinKind != OMPC_LINEAR_val) ||
LinKind == OMPC_LINEAR_unknown) {
Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus;
return true;
}
return false;
}
bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc,
OpenMPLinearClauseKind LinKind, QualType Type,
bool IsDeclareSimd) {
const auto *VD = dyn_cast_or_null<VarDecl>(D);
// A variable must not have an incomplete type or a reference type.
if (RequireCompleteType(ELoc, Type, diag::err_omp_linear_incomplete_type))
return true;
if ((LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref) &&
!Type->isReferenceType()) {
Diag(ELoc, diag::err_omp_wrong_linear_modifier_non_reference)
<< Type << getOpenMPSimpleClauseTypeName(OMPC_linear, LinKind);
return true;
}
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause, nor to the linear clause on
// declarative directives (like declare simd).
if (!IsDeclareSimd &&
rejectConstNotMutableType(*this, D, Type, OMPC_linear, ELoc))
return true;
// A list item must be of integral or pointer type.
Type = Type.getUnqualifiedType().getCanonicalType();
const auto *Ty = Type.getTypePtrOrNull();
if (!Ty || (LinKind != OMPC_LINEAR_ref && !Ty->isDependentType() &&
!Ty->isIntegralType(Context) && !Ty->isPointerType())) {
Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << Type;
if (D) {
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
}
return true;
}
return false;
}
OMPClause *Sema::ActOnOpenMPLinearClause(
ArrayRef<Expr *> VarList, Expr *Step, SourceLocation StartLoc,
SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind,
SourceLocation LinLoc, SourceLocation ColonLoc, SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> Privates;
SmallVector<Expr *, 8> Inits;
SmallVector<Decl *, 4> ExprCaptures;
SmallVector<Expr *, 4> ExprPostUpdates;
if (CheckOpenMPLinearModifier(LinKind, LinLoc))
LinKind = OMPC_LINEAR_val;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
Privates.push_back(nullptr);
Inits.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.14.3.7, linear clause]
// A list-item cannot appear in more than one linear clause.
// A list-item that appears in a linear clause cannot appear in any
// other data-sharing attribute clause.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_linear);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
if (CheckOpenMPLinearDecl(D, ELoc, LinKind, Type))
continue;
Type = Type.getNonReferenceType().getUnqualifiedType().getCanonicalType();
// Build private copy of original var.
VarDecl *Private =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
DeclRefExpr *PrivateRef = buildDeclRefExpr(*this, Private, Type, ELoc);
// Build var to save initial value.
VarDecl *Init = buildVarDecl(*this, ELoc, Type, ".linear.start");
Expr *InitExpr;
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext()) {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
if (!isOpenMPCapturedDecl(D)) {
ExprCaptures.push_back(Ref->getDecl());
if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
ExprResult RefRes = DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
continue;
ExprResult PostUpdateRes =
BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
SimpleRefExpr, RefRes.get());
if (!PostUpdateRes.isUsable())
continue;
ExprPostUpdates.push_back(
IgnoredValueConversions(PostUpdateRes.get()).get());
}
}
}
if (LinKind == OMPC_LINEAR_uval)
InitExpr = VD ? VD->getInit() : SimpleRefExpr;
else
InitExpr = VD ? SimpleRefExpr : Ref;
AddInitializerToDecl(Init, DefaultLvalueConversion(InitExpr).get(),
/*DirectInit=*/false);
DeclRefExpr *InitRef = buildDeclRefExpr(*this, Init, Type, ELoc);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_linear, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
Privates.push_back(PrivateRef);
Inits.push_back(InitRef);
}
if (Vars.empty())
return nullptr;
Expr *StepExpr = Step;
Expr *CalcStepExpr = nullptr;
if (Step && !Step->isValueDependent() && !Step->isTypeDependent() &&
!Step->isInstantiationDependent() &&
!Step->containsUnexpandedParameterPack()) {
SourceLocation StepLoc = Step->getBeginLoc();
ExprResult Val = PerformOpenMPImplicitIntegerConversion(StepLoc, Step);
if (Val.isInvalid())
return nullptr;
StepExpr = Val.get();
// Build var to save the step value.
VarDecl *SaveVar =
buildVarDecl(*this, StepLoc, StepExpr->getType(), ".linear.step");
ExprResult SaveRef =
buildDeclRefExpr(*this, SaveVar, StepExpr->getType(), StepLoc);
ExprResult CalcStep =
BuildBinOp(CurScope, StepLoc, BO_Assign, SaveRef.get(), StepExpr);
CalcStep = ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false);
// Warn about a zero linear step (it would probably be better specified as
// making the corresponding variables 'const').
llvm::APSInt Result;
bool IsConstant = StepExpr->isIntegerConstantExpr(Result, Context);
if (IsConstant && !Result.isNegative() && !Result.isStrictlyPositive())
Diag(StepLoc, diag::warn_omp_linear_step_zero) << Vars[0]
<< (Vars.size() > 1);
if (!IsConstant && CalcStep.isUsable()) {
// Calculate the step beforehand instead of doing this on each iteration.
// (This is not needed if the step expression is a compile-time constant.)
CalcStepExpr = CalcStep.get();
}
}
return OMPLinearClause::Create(Context, StartLoc, LParenLoc, LinKind, LinLoc,
ColonLoc, EndLoc, Vars, Privates, Inits,
StepExpr, CalcStepExpr,
buildPreInits(Context, ExprCaptures),
buildPostUpdate(*this, ExprPostUpdates));
}
static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
Expr *NumIterations, Sema &SemaRef,
Scope *S, DSAStackTy *Stack) {
// Walk the vars and build update/final expressions for the CodeGen.
SmallVector<Expr *, 8> Updates;
SmallVector<Expr *, 8> Finals;
SmallVector<Expr *, 8> UsedExprs;
Expr *Step = Clause.getStep();
Expr *CalcStep = Clause.getCalcStep();
// OpenMP [2.14.3.7, linear clause]
// If linear-step is not specified it is assumed to be 1.
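// That is, 'linear(i)' is treated the same as 'linear(i : 1)' (illustrative).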
if (!Step)
Step = SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
else if (CalcStep)
Step = cast<BinaryOperator>(CalcStep)->getLHS();
bool HasErrors = false;
auto CurInit = Clause.inits().begin();
auto CurPrivate = Clause.privates().begin();
OpenMPLinearClauseKind LinKind = Clause.getModifier();
for (Expr *RefExpr : Clause.varlists()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange);
ValueDecl *D = Res.first;
if (Res.second || !D) {
Updates.push_back(nullptr);
Finals.push_back(nullptr);
HasErrors = true;
continue;
}
auto &&Info = Stack->isLoopControlVariable(D);
// OpenMP [2.15.11, distribute simd Construct]
// A list item may not appear in a linear clause, unless it is the loop
// iteration variable.
if (isOpenMPDistributeDirective(Stack->getCurrentDirective()) &&
isOpenMPSimdDirective(Stack->getCurrentDirective()) && !Info.first) {
SemaRef.Diag(ELoc,
diag::err_omp_linear_distribute_var_non_loop_iteration);
Updates.push_back(nullptr);
Finals.push_back(nullptr);
HasErrors = true;
continue;
}
Expr *InitExpr = *CurInit;
// Build privatized reference to the current linear var.
auto *DE = cast<DeclRefExpr>(SimpleRefExpr);
Expr *CapturedRef;
if (LinKind == OMPC_LINEAR_uval)
CapturedRef = cast<VarDecl>(DE->getDecl())->getInit();
else
CapturedRef =
buildDeclRefExpr(SemaRef, cast<VarDecl>(DE->getDecl()),
DE->getType().getUnqualifiedType(), DE->getExprLoc(),
/*RefersToCapture=*/true);
// Build update: Var = InitExpr + IV * Step
ExprResult Update;
if (!Info.first)
Update = buildCounterUpdate(
SemaRef, S, RefExpr->getExprLoc(), *CurPrivate, InitExpr, IV, Step,
/*Subtract=*/false, /*IsNonRectangularLB=*/false);
else
Update = *CurPrivate;
Update = SemaRef.ActOnFinishFullExpr(Update.get(), DE->getBeginLoc(),
/*DiscardedValue*/ false);
// Build final: Var = InitExpr + NumIterations * Step
ExprResult Final;
if (!Info.first)
Final =
buildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), CapturedRef,
InitExpr, NumIterations, Step, /*Subtract=*/false,
/*IsNonRectangularLB=*/false);
else
Final = *CurPrivate;
Final = SemaRef.ActOnFinishFullExpr(Final.get(), DE->getBeginLoc(),
/*DiscardedValue*/ false);
if (!Update.isUsable() || !Final.isUsable()) {
Updates.push_back(nullptr);
Finals.push_back(nullptr);
UsedExprs.push_back(nullptr);
HasErrors = true;
} else {
Updates.push_back(Update.get());
Finals.push_back(Final.get());
if (!Info.first)
UsedExprs.push_back(SimpleRefExpr);
}
++CurInit;
++CurPrivate;
}
if (Expr *S = Clause.getStep())
UsedExprs.push_back(S);
// Fill the remaining part with nullptr.
UsedExprs.append(Clause.varlist_size() + 1 - UsedExprs.size(), nullptr);
Clause.setUpdates(Updates);
Clause.setFinals(Finals);
Clause.setUsedExprs(UsedExprs);
return HasErrors;
}
OMPClause *Sema::ActOnOpenMPAlignedClause(
ArrayRef<Expr *> VarList, Expr *Alignment, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType QType = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.8.1, simd construct, Restrictions]
// The type of list items appearing in the aligned clause must be
// array, pointer, reference to array, or reference to pointer.
QType = QType.getNonReferenceType().getUnqualifiedType().getCanonicalType();
const Type *Ty = QType.getTypePtrOrNull();
if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
Diag(ELoc, diag::err_omp_aligned_expected_array_or_ptr)
<< QType << getLangOpts().CPlusPlus << ERange;
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
// OpenMP [2.8.1, simd construct, Restrictions]
// A list-item cannot appear in more than one aligned clause.
if (const Expr *PrevRef = DSAStack->addUniqueAligned(D, SimpleRefExpr)) {
Diag(ELoc, diag::err_omp_used_in_clause_twice)
<< 0 << getOpenMPClauseName(OMPC_aligned) << ERange;
Diag(PrevRef->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_aligned);
continue;
}
DeclRefExpr *Ref = nullptr;
if (!VD && isOpenMPCapturedDecl(D))
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
Vars.push_back(DefaultFunctionArrayConversion(
(VD || !Ref) ? RefExpr->IgnoreParens() : Ref)
.get());
}
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the aligned clause, alignment, must be a constant
// positive integer expression.
// If no optional parameter is specified, implementation-defined default
// alignments for SIMD instructions on the target platforms are assumed.
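// For example (illustrative), 'aligned(p : 64)' asserts that 'p' is 64-byte
// aligned, while plain 'aligned(p)' uses the implementation-defined default
// SIMD alignment for the target.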
if (Alignment != nullptr) {
ExprResult AlignResult =
VerifyPositiveIntegerConstantInClause(Alignment, OMPC_aligned);
if (AlignResult.isInvalid())
return nullptr;
Alignment = AlignResult.get();
}
if (Vars.empty())
return nullptr;
return OMPAlignedClause::Create(Context, StartLoc, LParenLoc, ColonLoc,
EndLoc, Vars, Alignment);
}
OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP copyin clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
// It will be analyzed later.
Vars.push_back(RefExpr);
SrcExprs.push_back(nullptr);
DstExprs.push_back(nullptr);
AssignmentOps.push_back(nullptr);
continue;
}
SourceLocation ELoc = RefExpr->getExprLoc();
// OpenMP [2.1, C/C++]
// A list item is a variable name.
// OpenMP [2.14.4.1, Restrictions, p.1]
// A list item that appears in a copyin clause must be threadprivate.
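// For example (illustrative), 'copyin(tp)' is only valid if 'tp' was declared
// with '#pragma omp threadprivate(tp)'.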
auto *DE = dyn_cast<DeclRefExpr>(RefExpr);
if (!DE || !isa<VarDecl>(DE->getDecl())) {
Diag(ELoc, diag::err_omp_expected_var_name_member_expr)
<< 0 << RefExpr->getSourceRange();
continue;
}
Decl *D = DE->getDecl();
auto *VD = cast<VarDecl>(D);
QualType Type = VD->getType();
if (Type->isDependentType() || Type->isInstantiationDependentType()) {
// It will be analyzed later.
Vars.push_back(DE);
SrcExprs.push_back(nullptr);
DstExprs.push_back(nullptr);
AssignmentOps.push_back(nullptr);
continue;
}
// OpenMP [2.14.4.1, Restrictions, C/C++, p.1]
// A list item that appears in a copyin clause must be threadprivate.
if (!DSAStack->isThreadPrivate(VD)) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_copyin)
<< getOpenMPDirectiveName(OMPD_threadprivate);
continue;
}
// OpenMP [2.14.4.1, Restrictions, C/C++, p.2]
// A variable of class type (or array thereof) that appears in a
// copyin clause requires an accessible, unambiguous copy assignment
// operator for the class type.
QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType();
VarDecl *SrcVD =
buildVarDecl(*this, DE->getBeginLoc(), ElemType.getUnqualifiedType(),
".copyin.src", VD->hasAttrs() ? &VD->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(
*this, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc());
VarDecl *DstVD =
buildVarDecl(*this, DE->getBeginLoc(), ElemType, ".copyin.dst",
VD->hasAttrs() ? &VD->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr =
buildDeclRefExpr(*this, DstVD, ElemType, DE->getExprLoc());
// For arrays, generate an assignment operation for a single element and
// replace it with the original array element in CodeGen.
ExprResult AssignmentOp =
BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, PseudoDstExpr,
PseudoSrcExpr);
if (AssignmentOp.isInvalid())
continue;
AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(),
/*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
continue;
DSAStack->addDSA(VD, DE, OMPC_copyin);
Vars.push_back(DE);
SrcExprs.push_back(PseudoSrcExpr);
DstExprs.push_back(PseudoDstExpr);
AssignmentOps.push_back(AssignmentOp.get());
}
if (Vars.empty())
return nullptr;
return OMPCopyinClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
SrcExprs, DstExprs, AssignmentOps);
}
OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
SrcExprs.push_back(nullptr);
DstExprs.push_back(nullptr);
AssignmentOps.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.14.4.2, Restrictions, p.2]
// A list item that appears in a copyprivate clause may not appear in a
// private or firstprivate clause on the single construct.
if (!VD || !DSAStack->isThreadPrivate(VD)) {
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_copyprivate &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_copyprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
// OpenMP [2.11.4.2, Restrictions, p.1]
// All list items that appear in a copyprivate clause must be either
// threadprivate or private in the enclosing context.
if (DVar.CKind == OMPC_unknown) {
DVar = DSAStack->getImplicitDSA(D, false);
if (DVar.CKind == OMPC_shared) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_copyprivate)
<< "threadprivate or private in the enclosing context";
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
}
}
// Variably modified types are not supported.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType()) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_copyprivate) << Type
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
Diag(D->getLocation(),
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
continue;
}
// OpenMP [2.14.4.2, Restrictions, C/C++]
// A variable of class type (or array thereof) that appears in a
// copyprivate clause requires an accessible, unambiguous copy assignment
// operator for the class type.
Type = Context.getBaseElementType(Type.getNonReferenceType())
.getUnqualifiedType();
VarDecl *SrcVD =
buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.src",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(*this, SrcVD, Type, ELoc);
VarDecl *DstVD =
buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.dst",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
ExprResult AssignmentOp = BuildBinOp(
DSAStack->getCurScope(), ELoc, BO_Assign, PseudoDstExpr, PseudoSrcExpr);
if (AssignmentOp.isInvalid())
continue;
AssignmentOp =
ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
continue;
// No need to mark vars as copyprivate, they are already threadprivate or
// implicitly private.
assert(VD || isOpenMPCapturedDecl(D));
Vars.push_back(
VD ? RefExpr->IgnoreParens()
: buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false));
SrcExprs.push_back(PseudoSrcExpr);
DstExprs.push_back(PseudoDstExpr);
AssignmentOps.push_back(AssignmentOp.get());
}
if (Vars.empty())
return nullptr;
return OMPCopyprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, SrcExprs, DstExprs, AssignmentOps);
}
OMPClause *Sema::ActOnOpenMPFlushClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (VarList.empty())
return nullptr;
return OMPFlushClause::Create(Context, StartLoc, LParenLoc, EndLoc, VarList);
}
/// Tries to find the omp_depend_t type.
static bool findOMPDependT(Sema &S, SourceLocation Loc, DSAStackTy *Stack,
bool Diagnose = true) {
QualType OMPDependT = Stack->getOMPDependT();
if (!OMPDependT.isNull())
return true;
IdentifierInfo *II = &S.PP.getIdentifierTable().get("omp_depend_t");
ParsedType PT = S.getTypeName(*II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
if (Diagnose)
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_depend_t";
return false;
}
Stack->setOMPDependT(PT.get());
return true;
}
OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (!Depobj)
return nullptr;
bool OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack);
// OpenMP 5.0, 2.17.10.1 depobj Construct
// depobj is an lvalue expression of type omp_depend_t.
if (!Depobj->isTypeDependent() && !Depobj->isValueDependent() &&
!Depobj->isInstantiationDependent() &&
!Depobj->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
!Context.typesAreCompatible(DSAStack->getOMPDependT(), Depobj->getType(),
/*CompareUnqualified=*/true))) {
Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue)
<< 0 << Depobj->getType() << Depobj->getSourceRange();
}
if (!Depobj->isLValue()) {
Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue)
<< 1 << Depobj->getSourceRange();
}
return OMPDepobjClause::Create(Context, StartLoc, LParenLoc, EndLoc, Depobj);
}
OMPClause *
Sema::ActOnOpenMPDependClause(Expr *DepModifier, OpenMPDependClauseKind DepKind,
SourceLocation DepLoc, SourceLocation ColonLoc,
ArrayRef<Expr *> VarList, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation EndLoc) {
if (DSAStack->getCurrentDirective() == OMPD_ordered &&
DepKind != OMPC_DEPEND_source && DepKind != OMPC_DEPEND_sink) {
Diag(DepLoc, diag::err_omp_unexpected_clause_value)
<< "'source' or 'sink'" << getOpenMPClauseName(OMPC_depend);
return nullptr;
}
if ((DSAStack->getCurrentDirective() != OMPD_ordered ||
DSAStack->getCurrentDirective() == OMPD_depobj) &&
(DepKind == OMPC_DEPEND_unknown || DepKind == OMPC_DEPEND_source ||
DepKind == OMPC_DEPEND_sink ||
((LangOpts.OpenMP < 50 ||
DSAStack->getCurrentDirective() == OMPD_depobj) &&
DepKind == OMPC_DEPEND_depobj))) {
SmallVector<unsigned, 3> Except;
Except.push_back(OMPC_DEPEND_source);
Except.push_back(OMPC_DEPEND_sink);
if (LangOpts.OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj)
Except.push_back(OMPC_DEPEND_depobj);
std::string Expected = (LangOpts.OpenMP >= 50 && !DepModifier)
? "depend modifier(iterator) or "
: "";
Diag(DepLoc, diag::err_omp_unexpected_clause_value)
<< Expected + getListOfPossibleValues(OMPC_depend, /*First=*/0,
/*Last=*/OMPC_DEPEND_unknown,
Except)
<< getOpenMPClauseName(OMPC_depend);
return nullptr;
}
if (DepModifier &&
(DepKind == OMPC_DEPEND_source || DepKind == OMPC_DEPEND_sink)) {
Diag(DepModifier->getExprLoc(),
diag::err_omp_depend_sink_source_with_modifier);
return nullptr;
}
if (DepModifier &&
!DepModifier->getType()->isSpecificBuiltinType(BuiltinType::OMPIterator))
Diag(DepModifier->getExprLoc(), diag::err_omp_depend_modifier_not_iterator);
SmallVector<Expr *, 8> Vars;
DSAStackTy::OperatorOffsetTy OpsOffs;
llvm::APSInt DepCounter(/*BitWidth=*/32);
llvm::APSInt TotalDepCount(/*BitWidth=*/32);
if (DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) {
if (const Expr *OrderedCountExpr =
DSAStack->getParentOrderedRegionParam().first) {
TotalDepCount = OrderedCountExpr->EvaluateKnownConstInt(Context);
TotalDepCount.setIsUnsigned(/*Val=*/true);
}
}
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP shared clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
// It will be analyzed later.
Vars.push_back(RefExpr);
continue;
}
SourceLocation ELoc = RefExpr->getExprLoc();
Expr *SimpleExpr = RefExpr->IgnoreParenCasts();
if (DepKind == OMPC_DEPEND_sink) {
if (DSAStack->getParentOrderedRegionParam().first &&
DepCounter >= TotalDepCount) {
Diag(ELoc, diag::err_omp_depend_sink_unexpected_expr);
continue;
}
++DepCounter;
// OpenMP [2.13.9, Summary]
// depend(dependence-type : vec), where dependence-type is:
// 'sink' and where vec is the iteration vector, which has the form:
// x1 [+- d1], x2 [+- d2 ], . . . , xn [+- dn]
// where n is the value specified by the ordered clause in the loop
// directive, xi denotes the loop iteration variable of the i-th nested
// loop associated with the loop directive, and di is a constant
// non-negative integer.
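// For example (illustrative), inside a loop nest attached to a directive
// with 'ordered(2)' and loop variables 'i' and 'j', one may write
// '#pragma omp ordered depend(sink : i - 1, j)'.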
if (CurContext->isDependentContext()) {
// It will be analyzed later.
Vars.push_back(RefExpr);
continue;
}
SimpleExpr = SimpleExpr->IgnoreImplicit();
OverloadedOperatorKind OOK = OO_None;
SourceLocation OOLoc;
Expr *LHS = SimpleExpr;
Expr *RHS = nullptr;
if (auto *BO = dyn_cast<BinaryOperator>(SimpleExpr)) {
OOK = BinaryOperator::getOverloadedOperator(BO->getOpcode());
OOLoc = BO->getOperatorLoc();
LHS = BO->getLHS()->IgnoreParenImpCasts();
RHS = BO->getRHS()->IgnoreParenImpCasts();
} else if (auto *OCE = dyn_cast<CXXOperatorCallExpr>(SimpleExpr)) {
OOK = OCE->getOperator();
OOLoc = OCE->getOperatorLoc();
LHS = OCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
RHS = OCE->getArg(/*Arg=*/1)->IgnoreParenImpCasts();
} else if (auto *MCE = dyn_cast<CXXMemberCallExpr>(SimpleExpr)) {
OOK = MCE->getMethodDecl()
->getNameInfo()
.getName()
.getCXXOverloadedOperator();
OOLoc = MCE->getCallee()->getExprLoc();
LHS = MCE->getImplicitObjectArgument()->IgnoreParenImpCasts();
RHS = MCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
}
SourceLocation ELoc;
SourceRange ERange;
auto Res = getPrivateItem(*this, LHS, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
if (OOK != OO_Plus && OOK != OO_Minus && (RHS || OOK != OO_None)) {
Diag(OOLoc, diag::err_omp_depend_sink_expected_plus_minus);
continue;
}
if (RHS) {
ExprResult RHSRes = VerifyPositiveIntegerConstantInClause(
RHS, OMPC_depend, /*StrictlyPositive=*/false);
if (RHSRes.isInvalid())
continue;
}
if (!CurContext->isDependentContext() &&
DSAStack->getParentOrderedRegionParam().first &&
DepCounter != DSAStack->isParentLoopControlVariable(D).first) {
const ValueDecl *VD =
DSAStack->getParentLoopControlVariable(DepCounter.getZExtValue());
if (VD)
Diag(ELoc, diag::err_omp_depend_sink_expected_loop_iteration)
<< 1 << VD;
else
Diag(ELoc, diag::err_omp_depend_sink_expected_loop_iteration) << 0;
continue;
}
OpsOffs.emplace_back(RHS, OOK);
} else {
bool OMPDependTFound = LangOpts.OpenMP >= 50;
if (OMPDependTFound)
OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack,
DepKind == OMPC_DEPEND_depobj);
if (DepKind == OMPC_DEPEND_depobj) {
// OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++
// List items used in depend clauses with the depobj dependence type
// must be expressions of the omp_depend_t type.
if (!RefExpr->isValueDependent() && !RefExpr->isTypeDependent() &&
!RefExpr->isInstantiationDependent() &&
!RefExpr->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
!Context.hasSameUnqualifiedType(DSAStack->getOMPDependT(),
RefExpr->getType()))) {
Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue)
<< 0 << RefExpr->getType() << RefExpr->getSourceRange();
continue;
}
if (!RefExpr->isLValue()) {
Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue)
<< 1 << RefExpr->getType() << RefExpr->getSourceRange();
continue;
}
} else {
// OpenMP 5.0 [2.17.11, Restrictions]
// List items used in depend clauses cannot be zero-length array
// sections.
QualType ExprTy = RefExpr->getType().getNonReferenceType();
const auto *OASE = dyn_cast<OMPArraySectionExpr>(SimpleExpr);
if (OASE) {
QualType BaseType =
OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
ExprTy = ATy->getElementType();
else
ExprTy = BaseType->getPointeeType();
ExprTy = ExprTy.getNonReferenceType();
const Expr *Length = OASE->getLength();
Expr::EvalResult Result;
if (Length && !Length->isValueDependent() &&
Length->EvaluateAsInt(Result, Context) &&
Result.Val.getInt().isNullValue()) {
Diag(ELoc,
diag::err_omp_depend_zero_length_array_section_not_allowed)
<< SimpleExpr->getSourceRange();
continue;
}
}
// OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++
// List items used in depend clauses with the in, out, inout or
// mutexinoutset dependence types cannot be expressions of the
// omp_depend_t type.
if (!RefExpr->isValueDependent() && !RefExpr->isTypeDependent() &&
!RefExpr->isInstantiationDependent() &&
!RefExpr->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
DSAStack->getOMPDependT().getTypePtr() == ExprTy.getTypePtr())) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << 1
<< RefExpr->getSourceRange();
continue;
}
auto *ASE = dyn_cast<ArraySubscriptExpr>(SimpleExpr);
if (!RefExpr->IgnoreParenImpCasts()->isLValue() ||
(ASE && !ASE->getBase()->isTypeDependent() &&
!ASE->getBase()
->getType()
.getNonReferenceType()
->isPointerType() &&
!ASE->getBase()->getType().getNonReferenceType()->isArrayType())) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange();
continue;
}
ExprResult Res;
{
Sema::TentativeAnalysisScope Trap(*this);
Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf,
RefExpr->IgnoreParenImpCasts());
}
if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
!isa<OMPArrayShapingExpr>(SimpleExpr)) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange();
continue;
}
}
}
Vars.push_back(RefExpr->IgnoreParenImpCasts());
}
if (!CurContext->isDependentContext() && DepKind == OMPC_DEPEND_sink &&
TotalDepCount > VarList.size() &&
DSAStack->getParentOrderedRegionParam().first &&
DSAStack->getParentLoopControlVariable(VarList.size() + 1)) {
Diag(EndLoc, diag::err_omp_depend_sink_expected_loop_iteration)
<< 1 << DSAStack->getParentLoopControlVariable(VarList.size() + 1);
}
if (DepKind != OMPC_DEPEND_source && DepKind != OMPC_DEPEND_sink &&
Vars.empty())
return nullptr;
auto *C = OMPDependClause::Create(Context, StartLoc, LParenLoc, EndLoc,
DepModifier, DepKind, DepLoc, ColonLoc,
Vars, TotalDepCount.getZExtValue());
if ((DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) &&
DSAStack->isParentOrderedRegion())
DSAStack->addDoacrossDependClause(C, OpsOffs);
return C;
}
OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier,
Expr *Device, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation ModifierLoc,
SourceLocation EndLoc) {
assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 50) &&
"Unexpected device modifier in OpenMP < 50.");
bool ErrorFound = false;
if (ModifierLoc.isValid() && Modifier == OMPC_DEVICE_unknown) {
std::string Values =
getListOfPossibleValues(OMPC_device, /*First=*/0, OMPC_DEVICE_unknown);
Diag(ModifierLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_device);
ErrorFound = true;
}
Expr *ValExpr = Device;
Stmt *HelperValStmt = nullptr;
// OpenMP [2.9.1, Restrictions]
// The device expression must evaluate to a non-negative integer value.
ErrorFound = !isNonNegativeIntegerValue(ValExpr, *this, OMPC_device,
/*StrictlyPositive=*/false) ||
ErrorFound;
if (ErrorFound)
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_device, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
return new (Context)
OMPDeviceClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc,
LParenLoc, ModifierLoc, EndLoc);
}
static bool checkTypeMappable(SourceLocation SL, SourceRange SR, Sema &SemaRef,
DSAStackTy *Stack, QualType QTy,
bool FullCheck = true) {
NamedDecl *ND;
if (QTy->isIncompleteType(&ND)) {
SemaRef.Diag(SL, diag::err_incomplete_type) << QTy << SR;
return false;
}
if (FullCheck && !SemaRef.CurContext->isDependentContext() &&
!QTy.isTriviallyCopyableType(SemaRef.Context))
SemaRef.Diag(SL, diag::warn_omp_non_trivial_type_mapped) << QTy << SR;
return true;
}
/// Return true if it can be proven that the provided array expression
/// (array section or array subscript) does NOT specify the whole size of the
/// array whose base type is \a BaseQTy.
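/// For example (illustrative), for 'int a[10]', 'a[0:10]' may cover the whole
/// array (returns false), while 'a[2:3]' provably does not (returns true).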
static bool checkArrayExpressionDoesNotReferToWholeSize(Sema &SemaRef,
const Expr *E,
QualType BaseQTy) {
const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// If this is an array subscript, it refers to the whole size if the size of
// the dimension is constant and equals 1. Also, an array section assumes the
// format of an array subscript if no colon is used.
if (isa<ArraySubscriptExpr>(E) ||
(OASE && OASE->getColonLocFirst().isInvalid())) {
if (const auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// Size can't be evaluated statically.
return false;
}
assert(OASE && "Expecting array section if not an array subscript.");
const Expr *LowerBound = OASE->getLowerBound();
const Expr *Length = OASE->getLength();
// If there is a lower bound that does not evaluate to zero, we are not
// covering the whole dimension.
if (LowerBound) {
Expr::EvalResult Result;
if (!LowerBound->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLowerBound = Result.Val.getInt();
if (ConstLowerBound.getSExtValue())
return true;
}
// If we don't have a length, we are covering the whole dimension.
if (!Length)
return false;
// If the base is a pointer, we don't have a way to get the size of the
// pointee.
if (BaseQTy->isPointerType())
return false;
// We can only check if the length is the same as the size of the dimension
// if we have a constant array.
const auto *CATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr());
if (!CATy)
return false;
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLength = Result.Val.getInt();
return CATy->getSize().getSExtValue() != ConstLength.getSExtValue();
}
// Return true if it can be proven that the provided array expression (array
// section or array subscript) does NOT specify a single element of the array
// whose base type is \a BaseQTy.
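// For example (illustrative), 'a[3]' and 'a[3:1]' refer to a single element
// (returns false), while 'a[0:2]' provably does not (returns true).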
static bool checkArrayExpressionDoesNotReferToUnitySize(Sema &SemaRef,
const Expr *E,
QualType BaseQTy) {
const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// An array subscript always refers to a single element. Also, an array
// section assumes the format of an array subscript if no colon is used.
if (isa<ArraySubscriptExpr>(E) ||
(OASE && OASE->getColonLocFirst().isInvalid()))
return false;
assert(OASE && "Expecting array section if not an array subscript.");
const Expr *Length = OASE->getLength();
// If we don't have a length, we have to check if the array has unitary size
// for this dimension. Also, we should always expect a length if the base type
// is a pointer.
if (!Length) {
if (const auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// We cannot assume anything.
return false;
}
// Check if the length evaluates to 1.
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLength = Result.Val.getInt();
return ConstLength.getSExtValue() != 1;
}
// The base of the elements of a list in a map clause has to be either:
// - a reference to variable or field.
// - a member expression.
// - an array expression.
//
// E.g. if we have the expression 'r.S.Arr[:12]', we want to retrieve the
// reference to 'r'.
//
// If we have:
//
// struct SS {
// Bla S;
// foo() {
// #pragma omp target map (S.Arr[:12]);
// }
// }
//
// We want to retrieve the member expression 'this->S';
// OpenMP 5.0 [2.19.7.1, map Clause, Restrictions, p.2]
// If a list item is an array section, it must specify contiguous storage.
//
// For this restriction it is sufficient to make sure that only references
// to variables or fields and array expressions are used, and that no array
// sections exist except in the rightmost expression (unless they cover the
// whole dimension of the array). E.g. these would be invalid:
//
// r.ArrS[3:5].Arr[6:7]
//
// r.ArrS[3:5].x
//
// but these would be valid:
// r.ArrS[3].Arr[6:7]
//
// r.ArrS[3].x
namespace {
class MapBaseChecker final : public StmtVisitor<MapBaseChecker, bool> {
Sema &SemaRef;
OpenMPClauseKind CKind = OMPC_unknown;
OMPClauseMappableExprCommon::MappableExprComponentList &Components;
bool NoDiagnose = false;
const Expr *RelevantExpr = nullptr;
bool AllowUnitySizeArraySection = true;
bool AllowWholeSizeArraySection = true;
SourceLocation ELoc;
SourceRange ERange;
void emitErrorMsg() {
// If nothing else worked, this is not a valid map clause expression.
if (SemaRef.getLangOpts().OpenMP < 50) {
SemaRef.Diag(ELoc,
diag::err_omp_expected_named_var_member_or_array_expression)
<< ERange;
} else {
SemaRef.Diag(ELoc, diag::err_omp_non_lvalue_in_map_or_motion_clauses)
<< getOpenMPClauseName(CKind) << ERange;
}
}
public:
bool VisitDeclRefExpr(DeclRefExpr *DRE) {
if (!isa<VarDecl>(DRE->getDecl())) {
emitErrorMsg();
return false;
}
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = DRE;
// Record the component.
Components.emplace_back(DRE, DRE->getDecl());
return true;
}
bool VisitMemberExpr(MemberExpr *ME) {
Expr *E = ME;
Expr *BaseE = ME->getBase()->IgnoreParenCasts();
if (isa<CXXThisExpr>(BaseE)) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
// We found a base expression: this->Val.
RelevantExpr = ME;
} else {
E = BaseE;
}
if (!isa<FieldDecl>(ME->getMemberDecl())) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_expected_access_to_data_field)
<< ME->getSourceRange();
return false;
}
if (RelevantExpr)
return false;
return Visit(E);
}
auto *FD = cast<FieldDecl>(ME->getMemberDecl());
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.3]
// A bit-field cannot appear in a map clause.
//
if (FD->isBitField()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_bit_fields_forbidden_in_clause)
<< ME->getSourceRange() << getOpenMPClauseName(CKind);
return false;
}
if (RelevantExpr)
return false;
return Visit(E);
}
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
QualType CurType = BaseE->getType().getNonReferenceType();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.2]
// A list item cannot be a variable that is a member of a structure with
// a union type.
//
if (CurType->isUnionType()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_union_type_not_allowed)
<< ME->getSourceRange();
return false;
}
return RelevantExpr || Visit(E);
}
// If we got a member expression, we should not expect any array section
// before that:
//
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.7]
// If a list item is an element of a structure, only the rightmost symbol
// of the variable reference can be an array section.
//
AllowUnitySizeArraySection = false;
AllowWholeSizeArraySection = false;
// Record the component.
Components.emplace_back(ME, FD);
return RelevantExpr || Visit(E);
}
bool VisitArraySubscriptExpr(ArraySubscriptExpr *AE) {
Expr *E = AE->getBase()->IgnoreParenImpCasts();
if (!E->getType()->isAnyPointerType() && !E->getType()->isArrayType()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
<< 0 << AE->getSourceRange();
return false;
}
return RelevantExpr || Visit(E);
}
// If we got an array subscript that expresses the whole dimension, we
// can have any array expressions before it. If it only expresses part of
// the dimension, we can only have unitary-size array expressions.
if (checkArrayExpressionDoesNotReferToWholeSize(SemaRef, AE,
E->getType()))
AllowWholeSizeArraySection = false;
if (const auto *TE = dyn_cast<CXXThisExpr>(E->IgnoreParenCasts())) {
Expr::EvalResult Result;
if (!AE->getIdx()->isValueDependent() &&
AE->getIdx()->EvaluateAsInt(Result, SemaRef.getASTContext()) &&
!Result.Val.getInt().isNullValue()) {
SemaRef.Diag(AE->getIdx()->getExprLoc(),
diag::err_omp_invalid_map_this_expr);
SemaRef.Diag(AE->getIdx()->getExprLoc(),
diag::note_omp_invalid_subscript_on_this_ptr_map);
}
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = TE;
}
// Record the component - we don't have any declaration associated.
Components.emplace_back(AE, nullptr);
return RelevantExpr || Visit(E);
}
bool VisitOMPArraySectionExpr(OMPArraySectionExpr *OASE) {
assert(!NoDiagnose && "Array sections cannot be implicitly mapped.");
Expr *E = OASE->getBase()->IgnoreParenImpCasts();
QualType CurType =
OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
if (CurType->isReferenceType())
CurType = CurType->getPointeeType();
bool IsPointer = CurType->isAnyPointerType();
if (!IsPointer && !CurType->isArrayType()) {
SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
<< 0 << OASE->getSourceRange();
return false;
}
bool NotWhole =
checkArrayExpressionDoesNotReferToWholeSize(SemaRef, OASE, CurType);
bool NotUnity =
checkArrayExpressionDoesNotReferToUnitySize(SemaRef, OASE, CurType);
if (AllowWholeSizeArraySection) {
// Any array section is currently allowed. Allowing a whole size array
// section implies allowing a unity array section as well.
//
// If this array section refers to the whole dimension we can still
// accept other array sections before this one, except if the base is a
// pointer. Otherwise, only unitary sections are accepted.
if (NotWhole || IsPointer)
AllowWholeSizeArraySection = false;
} else if (AllowUnitySizeArraySection && NotUnity) {
// Only unity-size array sections are allowed at this point, and the
// current array section is not unity-size, so it cannot be proven to
// specify contiguous storage.
SemaRef.Diag(
ELoc, diag::err_array_section_does_not_specify_contiguous_storage)
<< OASE->getSourceRange();
return false;
}
if (const auto *TE = dyn_cast<CXXThisExpr>(E)) {
Expr::EvalResult ResultR;
Expr::EvalResult ResultL;
if (!OASE->getLength()->isValueDependent() &&
OASE->getLength()->EvaluateAsInt(ResultR, SemaRef.getASTContext()) &&
!ResultR.Val.getInt().isOneValue()) {
SemaRef.Diag(OASE->getLength()->getExprLoc(),
diag::err_omp_invalid_map_this_expr);
SemaRef.Diag(OASE->getLength()->getExprLoc(),
diag::note_omp_invalid_length_on_this_ptr_mapping);
}
if (OASE->getLowerBound() && !OASE->getLowerBound()->isValueDependent() &&
OASE->getLowerBound()->EvaluateAsInt(ResultL,
SemaRef.getASTContext()) &&
!ResultL.Val.getInt().isNullValue()) {
SemaRef.Diag(OASE->getLowerBound()->getExprLoc(),
diag::err_omp_invalid_map_this_expr);
SemaRef.Diag(OASE->getLowerBound()->getExprLoc(),
diag::note_omp_invalid_lower_bound_on_this_ptr_mapping);
}
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = TE;
}
// Record the component - we don't have any declaration associated.
Components.emplace_back(OASE, nullptr);
return RelevantExpr || Visit(E);
}
bool VisitOMPArrayShapingExpr(OMPArrayShapingExpr *E) {
Expr *Base = E->getBase();
// Record the component - we don't have any declaration associated.
Components.emplace_back(E, nullptr);
return Visit(Base->IgnoreParenImpCasts());
}
bool VisitUnaryOperator(UnaryOperator *UO) {
if (SemaRef.getLangOpts().OpenMP < 50 || !UO->isLValue() ||
UO->getOpcode() != UO_Deref) {
emitErrorMsg();
return false;
}
if (!RelevantExpr) {
// Record the component if we haven't found the base declaration yet.
Components.emplace_back(UO, nullptr);
}
return RelevantExpr || Visit(UO->getSubExpr()->IgnoreParenImpCasts());
}
bool VisitBinaryOperator(BinaryOperator *BO) {
if (SemaRef.getLangOpts().OpenMP < 50 || !BO->getType()->isPointerType()) {
emitErrorMsg();
return false;
}
// Pointer arithmetic is the only thing we expect to happen here, so after
// we make sure the binary operator has a pointer type, the only thing we
// need to do is to visit the subtree that has the same type as the root
// (so that we know the other subtree is just an offset).
Expr *LE = BO->getLHS()->IgnoreParenImpCasts();
Expr *RE = BO->getRHS()->IgnoreParenImpCasts();
Components.emplace_back(BO, nullptr);
assert((LE->getType().getTypePtr() == BO->getType().getTypePtr() ||
RE->getType().getTypePtr() == BO->getType().getTypePtr()) &&
"Either LHS or RHS have base decl inside");
if (BO->getType().getTypePtr() == LE->getType().getTypePtr())
return RelevantExpr || Visit(LE);
return RelevantExpr || Visit(RE);
}
bool VisitCXXThisExpr(CXXThisExpr *CTE) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = CTE;
Components.emplace_back(CTE, nullptr);
return true;
}
bool VisitStmt(Stmt *) {
emitErrorMsg();
return false;
}
const Expr *getFoundBase() const {
return RelevantExpr;
}
explicit MapBaseChecker(
Sema &SemaRef, OpenMPClauseKind CKind,
OMPClauseMappableExprCommon::MappableExprComponentList &Components,
bool NoDiagnose, SourceLocation &ELoc, SourceRange &ERange)
: SemaRef(SemaRef), CKind(CKind), Components(Components),
NoDiagnose(NoDiagnose), ELoc(ELoc), ERange(ERange) {}
};
} // namespace
/// Return the expression of the base of the mappable expression or null if it
/// cannot be determined, and perform all the necessary checks to see if the
/// expression is valid as a standalone mappable expression. In the process,
/// record all the components of the expression.
static const Expr *checkMapClauseExpressionBase(
Sema &SemaRef, Expr *E,
OMPClauseMappableExprCommon::MappableExprComponentList &CurComponents,
OpenMPClauseKind CKind, bool NoDiagnose) {
SourceLocation ELoc = E->getExprLoc();
SourceRange ERange = E->getSourceRange();
MapBaseChecker Checker(SemaRef, CKind, CurComponents, NoDiagnose, ELoc,
ERange);
if (Checker.Visit(E->IgnoreParens()))
return Checker.getFoundBase();
return nullptr;
}
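// Illustrative sketch (informal): with -fopenmp-version=50 the checker above
// also accepts lvalue expressions such as 'map(*p)' or 'map(*(p + 1))' via the
// unary/binary operator visitors; with earlier OpenMP versions the same
// expressions are diagnosed as invalid map clause expressions.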
// Return true if expression E associated with value VD has conflicts with other
// map information.
static bool checkMapConflicts(
Sema &SemaRef, DSAStackTy *DSAS, const ValueDecl *VD, const Expr *E,
bool CurrentRegionOnly,
OMPClauseMappableExprCommon::MappableExprComponentListRef CurComponents,
OpenMPClauseKind CKind) {
assert(VD && E);
SourceLocation ELoc = E->getExprLoc();
SourceRange ERange = E->getSourceRange();
// In order to easily check the conflicts we need to match each component of
// the expression under test with the components of the expressions that are
// already in the stack.
assert(!CurComponents.empty() && "Map clause expression with no components!");
assert(CurComponents.back().getAssociatedDeclaration() == VD &&
"Map clause expression with unexpected base!");
// Variables to help detecting enclosing problems in data environment nests.
bool IsEnclosedByDataEnvironmentExpr = false;
const Expr *EnclosingExpr = nullptr;
bool FoundError = DSAS->checkMappableExprComponentListsForDecl(
VD, CurrentRegionOnly,
[&IsEnclosedByDataEnvironmentExpr, &SemaRef, VD, CurrentRegionOnly, ELoc,
ERange, CKind, &EnclosingExpr,
CurComponents](OMPClauseMappableExprCommon::MappableExprComponentListRef
StackComponents,
OpenMPClauseKind) {
assert(!StackComponents.empty() &&
"Map clause expression with no components!");
assert(StackComponents.back().getAssociatedDeclaration() == VD &&
"Map clause expression with unexpected base!");
(void)VD;
// The whole expression in the stack.
const Expr *RE = StackComponents.front().getAssociatedExpression();
// Expressions must start from the same base. Here we detect at which
// point both expressions diverge from each other and check whether the
// memory referred to by both expressions is contiguous and does not
// overlap.
auto CI = CurComponents.rbegin();
auto CE = CurComponents.rend();
auto SI = StackComponents.rbegin();
auto SE = StackComponents.rend();
for (; CI != CE && SI != SE; ++CI, ++SI) {
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.3]
// At most one list item can be an array item derived from a given
// variable in map clauses of the same construct.
if (CurrentRegionOnly &&
(isa<ArraySubscriptExpr>(CI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(CI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(CI->getAssociatedExpression())) &&
(isa<ArraySubscriptExpr>(SI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(SI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(SI->getAssociatedExpression()))) {
SemaRef.Diag(CI->getAssociatedExpression()->getExprLoc(),
diag::err_omp_multiple_array_items_in_map_clause)
<< CI->getAssociatedExpression()->getSourceRange();
SemaRef.Diag(SI->getAssociatedExpression()->getExprLoc(),
diag::note_used_here)
<< SI->getAssociatedExpression()->getSourceRange();
return true;
}
// Do both expressions have the same kind?
if (CI->getAssociatedExpression()->getStmtClass() !=
SI->getAssociatedExpression()->getStmtClass())
break;
// Are we dealing with different variables/fields?
if (CI->getAssociatedDeclaration() != SI->getAssociatedDeclaration())
break;
}
// Check if the extra components of the expressions in the enclosing
// data environment are redundant for the current base declaration.
// If they are, the maps completely overlap, which is legal.
for (; SI != SE; ++SI) {
QualType Type;
if (const auto *ASE =
dyn_cast<ArraySubscriptExpr>(SI->getAssociatedExpression())) {
Type = ASE->getBase()->IgnoreParenImpCasts()->getType();
} else if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(
SI->getAssociatedExpression())) {
const Expr *E = OASE->getBase()->IgnoreParenImpCasts();
Type =
OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
} else if (const auto *OASE = dyn_cast<OMPArrayShapingExpr>(
SI->getAssociatedExpression())) {
Type = OASE->getBase()->getType()->getPointeeType();
}
if (Type.isNull() || Type->isAnyPointerType() ||
checkArrayExpressionDoesNotReferToWholeSize(
SemaRef, SI->getAssociatedExpression(), Type))
break;
}
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
// List items of map clauses in the same construct must not share
// original storage.
//
// If the expressions are exactly the same or one is a subset of the
// other, it means they are sharing storage.
if (CI == CE && SI == SE) {
if (CurrentRegionOnly) {
if (CKind == OMPC_map) {
SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
} else {
assert(CKind == OMPC_to || CKind == OMPC_from);
SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
<< ERange;
}
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
}
// If we find the same expression in the enclosing data environment,
// that is legal.
IsEnclosedByDataEnvironmentExpr = true;
return false;
}
QualType DerivedType =
std::prev(CI)->getAssociatedDeclaration()->getType();
SourceLocation DerivedLoc =
std::prev(CI)->getAssociatedExpression()->getExprLoc();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
DerivedType = DerivedType.getNonReferenceType();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.1]
// A variable for which the type is pointer and an array section
// derived from that variable must not appear as list items of map
// clauses of the same construct.
//
// Also, cover one of the cases in:
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
// If any part of the original storage of a list item has corresponding
// storage in the device data environment, all of the original storage
// must have corresponding storage in the device data environment.
//
if (DerivedType->isAnyPointerType()) {
if (CI == CE || SI == SE) {
SemaRef.Diag(
DerivedLoc,
diag::err_omp_pointer_mapped_along_with_derived_section)
<< DerivedLoc;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
}
if (CI->getAssociatedExpression()->getStmtClass() !=
SI->getAssociatedExpression()->getStmtClass() ||
CI->getAssociatedDeclaration()->getCanonicalDecl() ==
SI->getAssociatedDeclaration()->getCanonicalDecl()) {
assert(CI != CE && SI != SE);
SemaRef.Diag(DerivedLoc, diag::err_omp_same_pointer_dereferenced)
<< DerivedLoc;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
}
}
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
// List items of map clauses in the same construct must not share
// original storage.
//
// One expression is a subset of the other.
if (CurrentRegionOnly && (CI == CE || SI == SE)) {
if (CKind == OMPC_map) {
if (CI != CE || SI != SE) {
// Allow constructs like this: map(s, s.ptr[0:1]), where s.ptr is
// a pointer.
auto Begin =
CI != CE ? CurComponents.begin() : StackComponents.begin();
auto End = CI != CE ? CurComponents.end() : StackComponents.end();
auto It = Begin;
while (It != End && !It->getAssociatedDeclaration())
std::advance(It, 1);
assert(It != End &&
"Expected at least one component with the declaration.");
if (It != Begin && It->getAssociatedDeclaration()
->getType()
.getCanonicalType()
->isAnyPointerType()) {
IsEnclosedByDataEnvironmentExpr = false;
EnclosingExpr = nullptr;
return false;
}
}
SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
} else {
assert(CKind == OMPC_to || CKind == OMPC_from);
SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
<< ERange;
}
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
}
// The current expression uses the same base as another expression in the
// data environment but does not contain it completely.
if (!CurrentRegionOnly && SI != SE)
EnclosingExpr = RE;
// The current expression is a subset of the expression in the data
// environment.
IsEnclosedByDataEnvironmentExpr |=
(!CurrentRegionOnly && CI != CE && SI == SE);
return false;
});
if (CurrentRegionOnly)
return FoundError;
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
// If any part of the original storage of a list item has corresponding
// storage in the device data environment, all of the original storage must
// have corresponding storage in the device data environment.
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.6]
// If a list item is an element of a structure, and a different element of
// the structure has a corresponding list item in the device data environment
// prior to a task encountering the construct associated with the map clause,
// then the list item must also have a corresponding list item in the device
// data environment prior to the task encountering the construct.
//
if (EnclosingExpr && !IsEnclosedByDataEnvironmentExpr) {
SemaRef.Diag(ELoc,
diag::err_omp_original_storage_is_shared_and_does_not_contain)
<< ERange;
SemaRef.Diag(EnclosingExpr->getExprLoc(), diag::note_used_here)
<< EnclosingExpr->getSourceRange();
return true;
}
return FoundError;
}
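// Illustrative sketch (informal) of conflicts the function above diagnoses,
// assuming 'int a[10];':
//   #pragma omp target map(a[0:2]) map(a[3:2])
//     - two array items derived from the same variable on a single construct.
//   #pragma omp target data map(a[0:2]) enclosing '#pragma omp target map(a)'
//     - only part of the original storage is present in the enclosing data
//       environment.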
// Look up the user-defined mapper given the mapper name and mapped type, and
// build a reference to it.
static ExprResult buildUserDefinedMapperRef(Sema &SemaRef, Scope *S,
CXXScopeSpec &MapperIdScopeSpec,
const DeclarationNameInfo &MapperId,
QualType Type,
Expr *UnresolvedMapper) {
if (MapperIdScopeSpec.isInvalid())
return ExprError();
// Get the actual type for the array type.
if (Type->isArrayType()) {
assert(Type->getAsArrayTypeUnsafe() && "Expect to get a valid array type");
Type = Type->getAsArrayTypeUnsafe()->getElementType().getCanonicalType();
}
// Find all user-defined mappers with the given MapperId.
SmallVector<UnresolvedSet<8>, 4> Lookups;
LookupResult Lookup(SemaRef, MapperId, Sema::LookupOMPMapperName);
Lookup.suppressDiagnostics();
if (S) {
while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
while (S && !S->isDeclScope(D))
S = S->getParent();
if (S)
S = S->getParent();
Lookups.emplace_back();
Lookups.back().append(Lookup.begin(), Lookup.end());
Lookup.clear();
}
} else if (auto *ULE = cast_or_null<UnresolvedLookupExpr>(UnresolvedMapper)) {
// Extract the user-defined mappers with the given MapperId.
Lookups.push_back(UnresolvedSet<8>());
for (NamedDecl *D : ULE->decls()) {
auto *DMD = cast<OMPDeclareMapperDecl>(D);
assert(DMD && "Expect valid OMPDeclareMapperDecl during instantiation.");
Lookups.back().addDecl(DMD);
}
}
// Defer the lookup for dependent types. The results will be passed through
// UnresolvedMapper on instantiation.
if (SemaRef.CurContext->isDependentContext() || Type->isDependentType() ||
Type->isInstantiationDependentType() ||
Type->containsUnexpandedParameterPack() ||
filterLookupForUDReductionAndMapper<bool>(Lookups, [](ValueDecl *D) {
return !D->isInvalidDecl() &&
(D->getType()->isDependentType() ||
D->getType()->isInstantiationDependentType() ||
D->getType()->containsUnexpandedParameterPack());
})) {
UnresolvedSet<8> URS;
for (const UnresolvedSet<8> &Set : Lookups) {
if (Set.empty())
continue;
URS.append(Set.begin(), Set.end());
}
return UnresolvedLookupExpr::Create(
SemaRef.Context, /*NamingClass=*/nullptr,
MapperIdScopeSpec.getWithLocInContext(SemaRef.Context), MapperId,
/*ADL=*/false, /*Overloaded=*/true, URS.begin(), URS.end());
}
SourceLocation Loc = MapperId.getLoc();
// [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions
// The type must be of struct, union or class type in C and C++
if (!Type->isStructureOrClassType() && !Type->isUnionType() &&
(MapperIdScopeSpec.isSet() || MapperId.getAsString() != "default")) {
SemaRef.Diag(Loc, diag::err_omp_mapper_wrong_type);
return ExprError();
}
// Perform argument dependent lookup.
if (SemaRef.getLangOpts().CPlusPlus && !MapperIdScopeSpec.isSet())
argumentDependentLookup(SemaRef, MapperId, Loc, Type, Lookups);
// Return the first user-defined mapper with the desired type.
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Type](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.Context.hasSameType(D->getType(), Type))
return D;
return nullptr;
}))
return SemaRef.BuildDeclRefExpr(VD, Type, VK_LValue, Loc);
// Find the first user-defined mapper whose type is a base class of the
// desired type.
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Type, Loc](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.IsDerivedFrom(Loc, Type, D->getType()) &&
!Type.isMoreQualifiedThan(D->getType()))
return D;
return nullptr;
})) {
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/false);
if (SemaRef.IsDerivedFrom(Loc, Type, VD->getType(), Paths)) {
if (!Paths.isAmbiguous(SemaRef.Context.getCanonicalType(
VD->getType().getUnqualifiedType()))) {
if (SemaRef.CheckBaseClassAccess(
Loc, VD->getType(), Type, Paths.front(),
/*DiagID=*/0) != Sema::AR_inaccessible) {
return SemaRef.BuildDeclRefExpr(VD, Type, VK_LValue, Loc);
}
}
}
}
// Report error if a mapper is specified, but cannot be found.
if (MapperIdScopeSpec.isSet() || MapperId.getAsString() != "default") {
SemaRef.Diag(Loc, diag::err_omp_invalid_mapper)
<< Type << MapperId.getName();
return ExprError();
}
return ExprEmpty();
}
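// Illustrative usage (informal sketch, hypothetical names): given a mapper
// declared e.g. as
//   #pragma omp declare mapper(id : struct vec v) map(v.len, v.data[0:v.len])
// a clause such as 'map(mapper(id), tofrom: w)' makes the lookup above resolve
// the name 'id' against the type 'struct vec'; an unknown mapper name is
// reported with err_omp_invalid_mapper.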
namespace {
// Utility struct that gathers all the related lists associated with a mappable
// expression.
struct MappableVarListInfo {
// The list of expressions.
ArrayRef<Expr *> VarList;
// The list of processed expressions.
SmallVector<Expr *, 16> ProcessedVarList;
// The mappable components for each expression.
OMPClauseMappableExprCommon::MappableExprComponentLists VarComponents;
// The base declaration of the variable.
SmallVector<ValueDecl *, 16> VarBaseDeclarations;
// The reference to the user-defined mapper associated with every expression.
SmallVector<Expr *, 16> UDMapperList;
MappableVarListInfo(ArrayRef<Expr *> VarList) : VarList(VarList) {
// We have a list of components and base declarations for each entry in the
// variable list.
VarComponents.reserve(VarList.size());
VarBaseDeclarations.reserve(VarList.size());
}
};
}
// Check the validity of the provided variable list for the provided clause kind
// \a CKind. In the check process the valid expressions, mappable expression
// components, variables, and user-defined mappers are extracted and used to
// fill \a ProcessedVarList, \a VarComponents, \a VarBaseDeclarations, and \a
// UDMapperList in MVLI. \a MapType, \a IsMapTypeImplicit, \a MapperIdScopeSpec,
// and \a MapperId are expected to be valid if the clause kind is 'map'.
static void checkMappableExpressionList(
Sema &SemaRef, DSAStackTy *DSAS, OpenMPClauseKind CKind,
MappableVarListInfo &MVLI, SourceLocation StartLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo MapperId,
ArrayRef<Expr *> UnresolvedMappers,
OpenMPMapClauseKind MapType = OMPC_MAP_unknown,
bool IsMapTypeImplicit = false) {
// We only expect mappable expressions in 'to', 'from', and 'map' clauses.
assert((CKind == OMPC_map || CKind == OMPC_to || CKind == OMPC_from) &&
"Unexpected clause kind with mappable expressions!");
// If the identifier of the user-defined mapper is not specified, it is "default".
// We do not change the actual name in this clause to distinguish whether a
// mapper is specified explicitly, i.e., it is not explicitly specified when
// MapperId.getName() is empty.
if (!MapperId.getName() || MapperId.getName().isEmpty()) {
auto &DeclNames = SemaRef.getASTContext().DeclarationNames;
MapperId.setName(DeclNames.getIdentifier(
&SemaRef.getASTContext().Idents.get("default")));
}
// Iterators to find the current unresolved mapper expression.
auto UMIt = UnresolvedMappers.begin(), UMEnd = UnresolvedMappers.end();
bool UpdateUMIt = false;
Expr *UnresolvedMapper = nullptr;
// Keep track of the mappable components and base declarations in this clause.
// Each entry in the list is going to have an associated list of components. We
// record each set of components so that we can build the clause later on.
// In the end we should have the same number of declarations and component
// lists.
for (Expr *RE : MVLI.VarList) {
assert(RE && "Null expr in omp to/from/map clause");
SourceLocation ELoc = RE->getExprLoc();
// Find the current unresolved mapper expression.
if (UpdateUMIt && UMIt != UMEnd) {
UMIt++;
assert(
UMIt != UMEnd &&
"Expect the size of UnresolvedMappers to match with that of VarList");
}
UpdateUMIt = true;
if (UMIt != UMEnd)
UnresolvedMapper = *UMIt;
const Expr *VE = RE->IgnoreParenLValueCasts();
if (VE->isValueDependent() || VE->isTypeDependent() ||
VE->isInstantiationDependent() ||
VE->containsUnexpandedParameterPack()) {
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
VE->getType().getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
continue;
MVLI.UDMapperList.push_back(ER.get());
// We can only analyze this information once the missing information is
// resolved.
MVLI.ProcessedVarList.push_back(RE);
continue;
}
Expr *SimpleExpr = RE->IgnoreParenCasts();
if (!RE->isLValue()) {
if (SemaRef.getLangOpts().OpenMP < 50) {
SemaRef.Diag(
ELoc, diag::err_omp_expected_named_var_member_or_array_expression)
<< RE->getSourceRange();
} else {
SemaRef.Diag(ELoc, diag::err_omp_non_lvalue_in_map_or_motion_clauses)
<< getOpenMPClauseName(CKind) << RE->getSourceRange();
}
continue;
}
OMPClauseMappableExprCommon::MappableExprComponentList CurComponents;
ValueDecl *CurDeclaration = nullptr;
// Obtain the array or member expression bases if required. Also, fill the
// components array with all the components identified in the process.
const Expr *BE = checkMapClauseExpressionBase(
SemaRef, SimpleExpr, CurComponents, CKind, /*NoDiagnose=*/false);
if (!BE)
continue;
assert(!CurComponents.empty() &&
"Invalid mappable expression information.");
if (const auto *TE = dyn_cast<CXXThisExpr>(BE)) {
// Add store "this" pointer to class in DSAStackTy for future checking
DSAS->addMappedClassesQualTypes(TE->getType());
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
VE->getType().getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
continue;
MVLI.UDMapperList.push_back(ER.get());
// Skip restriction checking for variable or field declarations
MVLI.ProcessedVarList.push_back(RE);
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
MVLI.VarComponents.back().append(CurComponents.begin(),
CurComponents.end());
MVLI.VarBaseDeclarations.push_back(nullptr);
continue;
}
// For the following checks, we rely on the base declaration which is
// expected to be associated with the last component. The declaration is
// expected to be a variable or a field (if 'this' is being mapped).
CurDeclaration = CurComponents.back().getAssociatedDeclaration();
assert(CurDeclaration && "Null decl on map clause.");
assert(
CurDeclaration->isCanonicalDecl() &&
"Expecting components to have associated only canonical declarations.");
auto *VD = dyn_cast<VarDecl>(CurDeclaration);
const auto *FD = dyn_cast<FieldDecl>(CurDeclaration);
assert((VD || FD) && "Only variables or fields are expected here!");
(void)FD;
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.10]
// threadprivate variables cannot appear in a map clause.
// OpenMP 4.5 [2.10.5, target update Construct]
// threadprivate variables cannot appear in a from clause.
if (VD && DSAS->isThreadPrivate(VD)) {
DSAStackTy::DSAVarData DVar = DSAS->getTopDSA(VD, /*FromParent=*/false);
SemaRef.Diag(ELoc, diag::err_omp_threadprivate_in_clause)
<< getOpenMPClauseName(CKind);
reportOriginalDsa(SemaRef, DSAS, VD, DVar);
continue;
}
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.9]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct.
// Check conflicts with other map clause expressions. We check the conflicts
// with the current construct separately from the enclosing data
// environment, because the restrictions are different. We only have to
// check conflicts across regions for the map clauses.
if (checkMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
/*CurrentRegionOnly=*/true, CurComponents, CKind))
break;
if (CKind == OMPC_map &&
checkMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
/*CurrentRegionOnly=*/false, CurComponents, CKind))
break;
// OpenMP 4.5 [2.10.5, target update Construct]
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type will
// be considered to be T for all purposes of this clause.
auto I = llvm::find_if(
CurComponents,
[](const OMPClauseMappableExprCommon::MappableComponent &MC) {
return MC.getAssociatedDeclaration();
});
assert(I != CurComponents.end() && "Null decl on map clause.");
QualType Type;
auto *ASE = dyn_cast<ArraySubscriptExpr>(VE->IgnoreParens());
auto *OASE = dyn_cast<OMPArraySectionExpr>(VE->IgnoreParens());
auto *OAShE = dyn_cast<OMPArrayShapingExpr>(VE->IgnoreParens());
if (ASE) {
Type = ASE->getType().getNonReferenceType();
} else if (OASE) {
QualType BaseType =
OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
Type = ATy->getElementType();
else
Type = BaseType->getPointeeType();
Type = Type.getNonReferenceType();
} else if (OAShE) {
Type = OAShE->getBase()->getType()->getPointeeType();
} else {
Type = VE->getType();
}
// OpenMP 4.5 [2.10.5, target update Construct, Restrictions, p.4]
// A list item in a to or from clause must have a mappable type.
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.9]
// A list item must have a mappable type.
if (!checkTypeMappable(VE->getExprLoc(), VE->getSourceRange(), SemaRef,
DSAS, Type))
continue;
Type = I->getAssociatedDeclaration()->getType().getNonReferenceType();
if (CKind == OMPC_map) {
// target enter data
// OpenMP [2.10.2, Restrictions, p. 99]
// A map-type must be specified in all map clauses and must be either
// to or alloc.
OpenMPDirectiveKind DKind = DSAS->getCurrentDirective();
if (DKind == OMPD_target_enter_data &&
!(MapType == OMPC_MAP_to || MapType == OMPC_MAP_alloc)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
continue;
}
// target exit data
// OpenMP [2.10.3, Restrictions, p. 102]
// A map-type must be specified in all map clauses and must be either
// from, release, or delete.
if (DKind == OMPD_target_exit_data &&
!(MapType == OMPC_MAP_from || MapType == OMPC_MAP_release ||
MapType == OMPC_MAP_delete)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
continue;
}
// target, target data
// OpenMP 5.0 [2.12.2, Restrictions, p. 163]
// OpenMP 5.0 [2.12.5, Restrictions, p. 174]
// A map-type in a map clause must be to, from, tofrom or alloc
if ((DKind == OMPD_target_data ||
isOpenMPTargetExecutionDirective(DKind)) &&
!(MapType == OMPC_MAP_to || MapType == OMPC_MAP_from ||
MapType == OMPC_MAP_tofrom || MapType == OMPC_MAP_alloc)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
continue;
}
// OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
//
// OpenMP 5.0 [2.19.7.1, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
if (VD && ((SemaRef.LangOpts.OpenMP <= 45 &&
isOpenMPTargetExecutionDirective(DKind)) ||
DKind == OMPD_target)) {
DSAStackTy::DSAVarData DVar = DSAS->getTopDSA(VD, /*FromParent=*/false);
if (isOpenMPPrivate(DVar.CKind)) {
SemaRef.Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_map)
<< getOpenMPDirectiveName(DSAS->getCurrentDirective());
reportOriginalDsa(SemaRef, DSAS, CurDeclaration, DVar);
continue;
}
}
}
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
Type.getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
continue;
MVLI.UDMapperList.push_back(ER.get());
// Save the current expression.
MVLI.ProcessedVarList.push_back(RE);
// Store the components in the stack so that they can be used to check
// against other clauses later on.
DSAS->addMappableExpressionComponents(CurDeclaration, CurComponents,
/*WhereFoundClauseKind=*/OMPC_map);
// Save the components and declaration to create the clause. For purposes of
// the clause creation, any component list that has base 'this' uses
// null as the base declaration.
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
MVLI.VarComponents.back().append(CurComponents.begin(),
CurComponents.end());
MVLI.VarBaseDeclarations.push_back(isa<MemberExpr>(BE) ? nullptr
: CurDeclaration);
}
}
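// Illustrative sketch (informal) of the map-type restrictions checked above:
//   #pragma omp target enter data map(to: x)      // accepted
//   #pragma omp target enter data map(from: x)    // rejected: must be 'to' or
//                                                 // 'alloc'
//   #pragma omp target exit data map(release: x)  // accepted
//   #pragma omp target exit data map(to: x)       // rejected: must be 'from',
//                                                 // 'release' or 'delete'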
OMPClause *Sema::ActOnOpenMPMapClause(
ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
ArrayRef<SourceLocation> MapTypeModifiersLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId,
OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc,
SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs, ArrayRef<Expr *> UnresolvedMappers) {
OpenMPMapModifierKind Modifiers[] = {OMPC_MAP_MODIFIER_unknown,
OMPC_MAP_MODIFIER_unknown,
OMPC_MAP_MODIFIER_unknown};
SourceLocation ModifiersLoc[NumberOfOMPMapClauseModifiers];
// Process map-type-modifiers, flag errors for duplicate modifiers.
unsigned Count = 0;
for (unsigned I = 0, E = MapTypeModifiers.size(); I < E; ++I) {
if (MapTypeModifiers[I] != OMPC_MAP_MODIFIER_unknown &&
llvm::find(Modifiers, MapTypeModifiers[I]) != std::end(Modifiers)) {
Diag(MapTypeModifiersLoc[I], diag::err_omp_duplicate_map_type_modifier);
continue;
}
assert(Count < NumberOfOMPMapClauseModifiers &&
"Modifiers exceed the allowed number of map type modifiers");
Modifiers[Count] = MapTypeModifiers[I];
ModifiersLoc[Count] = MapTypeModifiersLoc[I];
++Count;
}
MappableVarListInfo MVLI(VarList);
checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, Locs.StartLoc,
MapperIdScopeSpec, MapperId, UnresolvedMappers,
MapType, IsMapTypeImplicit);
// We need to produce a map clause even if we don't have variables so that
// other diagnostics related to non-existing map clauses are accurate.
return OMPMapClause::Create(Context, Locs, MVLI.ProcessedVarList,
MVLI.VarBaseDeclarations, MVLI.VarComponents,
MVLI.UDMapperList, Modifiers, ModifiersLoc,
MapperIdScopeSpec.getWithLocInContext(Context),
MapperId, MapType, IsMapTypeImplicit, MapLoc);
}
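// Illustrative sketch (informal): a repeated map-type modifier, e.g.
//   #pragma omp target map(always, always, tofrom: x)
// is diagnosed above with err_omp_duplicate_map_type_modifier while the rest
// of the clause is still processed.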
QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
TypeResult ParsedType) {
assert(ParsedType.isUsable());
QualType ReductionType = GetTypeFromParser(ParsedType.get());
if (ReductionType.isNull())
return QualType();
// [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions, C/C++
// A type name in a declare reduction directive cannot be a function type, an
// array type, a reference type, or a type qualified with const, volatile or
// restrict.
if (ReductionType.hasQualifiers()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 0;
return QualType();
}
if (ReductionType->isFunctionType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 1;
return QualType();
}
if (ReductionType->isReferenceType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 2;
return QualType();
}
if (ReductionType->isArrayType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 3;
return QualType();
}
return ReductionType;
}
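// Illustrative sketch (informal) of reduction types rejected above:
//   #pragma omp declare reduction(foo : const int : omp_out += omp_in) // qualified type
//   #pragma omp declare reduction(foo : int[10] : omp_out += omp_in)   // array type
//   #pragma omp declare reduction(foo : int & : omp_out += omp_in)     // reference type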
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart(
Scope *S, DeclContext *DC, DeclarationName Name,
ArrayRef<std::pair<QualType, SourceLocation>> ReductionTypes,
AccessSpecifier AS, Decl *PrevDeclInScope) {
SmallVector<Decl *, 8> Decls;
Decls.reserve(ReductionTypes.size());
LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPReductionName,
forRedeclarationInCurContext());
// [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions
// A reduction-identifier may not be re-declared in the current scope for the
// same type or for a type that is compatible according to the base language
// rules.
llvm::DenseMap<QualType, SourceLocation> PreviousRedeclTypes;
OMPDeclareReductionDecl *PrevDRD = nullptr;
bool InCompoundScope = true;
if (S != nullptr) {
// Find previous declaration with the same name not referenced in other
// declarations.
FunctionScopeInfo *ParentFn = getEnclosingFunction();
InCompoundScope =
(ParentFn != nullptr) && !ParentFn->CompoundScopes.empty();
LookupName(Lookup, S);
FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false,
/*AllowInlineNamespace=*/false);
llvm::DenseMap<OMPDeclareReductionDecl *, bool> UsedAsPrevious;
LookupResult::Filter Filter = Lookup.makeFilter();
while (Filter.hasNext()) {
auto *PrevDecl = cast<OMPDeclareReductionDecl>(Filter.next());
if (InCompoundScope) {
auto I = UsedAsPrevious.find(PrevDecl);
if (I == UsedAsPrevious.end())
UsedAsPrevious[PrevDecl] = false;
if (OMPDeclareReductionDecl *D = PrevDecl->getPrevDeclInScope())
UsedAsPrevious[D] = true;
}
PreviousRedeclTypes[PrevDecl->getType().getCanonicalType()] =
PrevDecl->getLocation();
}
Filter.done();
if (InCompoundScope) {
for (const auto &PrevData : UsedAsPrevious) {
if (!PrevData.second) {
PrevDRD = PrevData.first;
break;
}
}
}
} else if (PrevDeclInScope != nullptr) {
auto *PrevDRDInScope = PrevDRD =
cast<OMPDeclareReductionDecl>(PrevDeclInScope);
do {
PreviousRedeclTypes[PrevDRDInScope->getType().getCanonicalType()] =
PrevDRDInScope->getLocation();
PrevDRDInScope = PrevDRDInScope->getPrevDeclInScope();
} while (PrevDRDInScope != nullptr);
}
for (const auto &TyData : ReductionTypes) {
const auto I = PreviousRedeclTypes.find(TyData.first.getCanonicalType());
bool Invalid = false;
if (I != PreviousRedeclTypes.end()) {
Diag(TyData.second, diag::err_omp_declare_reduction_redefinition)
<< TyData.first;
Diag(I->second, diag::note_previous_definition);
Invalid = true;
}
PreviousRedeclTypes[TyData.first.getCanonicalType()] = TyData.second;
auto *DRD = OMPDeclareReductionDecl::Create(Context, DC, TyData.second,
Name, TyData.first, PrevDRD);
DC->addDecl(DRD);
DRD->setAccess(AS);
Decls.push_back(DRD);
if (Invalid)
DRD->setInvalidDecl();
else
PrevDRD = DRD;
}
return DeclGroupPtrTy::make(
DeclGroupRef::Create(Context, Decls.begin(), Decls.size()));
}
void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
// Enter new function scope.
PushFunctionScope();
setFunctionHasBranchProtectedScope();
getCurFunction()->setHasOMPDeclareReductionCombiner();
if (S != nullptr)
PushDeclContext(S, DRD);
else
CurContext = DRD;
PushExpressionEvaluationContext(
ExpressionEvaluationContext::PotentiallyEvaluated);
QualType ReductionType = DRD->getType();
// Create 'T* omp_parm;T omp_in;'. All references to 'omp_in' will
// be replaced by '*omp_parm' during codegen. This is required because 'omp_in'
// uses the semantics of an argument handled by value, but it should be passed
// by reference. The C language does not support references, so all parameters
// are passed as pointers.
// Create 'T omp_in;' variable.
VarDecl *OmpInParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_in");
// Create 'T* omp_parm;T omp_out;'. All references to 'omp_out' will
// be replaced by '*omp_parm' during codegen. This is required because 'omp_out'
// uses the semantics of an argument handled by value, but it should be passed
// by reference. The C language does not support references, so all parameters
// are passed as pointers.
// Create 'T omp_out;' variable.
VarDecl *OmpOutParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_out");
if (S != nullptr) {
PushOnScopeChains(OmpInParm, S);
PushOnScopeChains(OmpOutParm, S);
} else {
DRD->addDecl(OmpInParm);
DRD->addDecl(OmpOutParm);
}
Expr *InE =
::buildDeclRefExpr(*this, OmpInParm, ReductionType, D->getLocation());
Expr *OutE =
::buildDeclRefExpr(*this, OmpOutParm, ReductionType, D->getLocation());
DRD->setCombinerData(InE, OutE);
}
void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
DiscardCleanupsInEvaluationContext();
PopExpressionEvaluationContext();
PopDeclContext();
PopFunctionScopeInfo();
if (Combiner != nullptr)
DRD->setCombiner(Combiner);
else
DRD->setInvalidDecl();
}
VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
// Enter new function scope.
PushFunctionScope();
setFunctionHasBranchProtectedScope();
if (S != nullptr)
PushDeclContext(S, DRD);
else
CurContext = DRD;
PushExpressionEvaluationContext(
ExpressionEvaluationContext::PotentiallyEvaluated);
QualType ReductionType = DRD->getType();
// Create 'T* omp_parm;T omp_priv;'. All references to 'omp_priv' will
// be replaced by '*omp_parm' during codegen. This is required because 'omp_priv'
// uses the semantics of an argument handled by value, but it should be passed
// by reference. The C language does not support references, so all parameters
// are passed as pointers.
// Create 'T omp_priv;' variable.
VarDecl *OmpPrivParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_priv");
// Create 'T* omp_parm;T omp_orig;'. All references to 'omp_orig' will
// be replaced by '*omp_parm' during codegen. This is required because 'omp_orig'
// uses the semantics of an argument handled by value, but it should be passed
// by reference. The C language does not support references, so all parameters
// are passed as pointers.
// Create 'T omp_orig;' variable.
VarDecl *OmpOrigParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_orig");
if (S != nullptr) {
PushOnScopeChains(OmpPrivParm, S);
PushOnScopeChains(OmpOrigParm, S);
} else {
DRD->addDecl(OmpPrivParm);
DRD->addDecl(OmpOrigParm);
}
Expr *OrigE =
::buildDeclRefExpr(*this, OmpOrigParm, ReductionType, D->getLocation());
Expr *PrivE =
::buildDeclRefExpr(*this, OmpPrivParm, ReductionType, D->getLocation());
DRD->setInitializerData(OrigE, PrivE);
return OmpPrivParm;
}
void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer,
VarDecl *OmpPrivParm) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
DiscardCleanupsInEvaluationContext();
PopExpressionEvaluationContext();
PopDeclContext();
PopFunctionScopeInfo();
if (Initializer != nullptr) {
DRD->setInitializer(Initializer, OMPDeclareReductionDecl::CallInit);
} else if (OmpPrivParm->hasInit()) {
DRD->setInitializer(OmpPrivParm->getInit(),
OmpPrivParm->isDirectInit()
? OMPDeclareReductionDecl::DirectInit
: OMPDeclareReductionDecl::CopyInit);
} else {
DRD->setInvalidDecl();
}
}
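// Illustrative usage (informal sketch) of the combiner/initializer handling
// above, using the special variables created for it:
//   #pragma omp declare reduction(merge : std::vector<int> :
//           omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))
//       initializer(omp_priv = omp_orig)
// 'omp_in'/'omp_out' are only visible in the combiner, and 'omp_priv'/'omp_orig'
// only in the initializer, matching the scopes pushed above.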
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd(
Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid) {
for (Decl *D : DeclReductions.get()) {
if (IsValid) {
if (S)
PushOnScopeChains(cast<OMPDeclareReductionDecl>(D), S,
/*AddToContext=*/false);
} else {
D->setInvalidDecl();
}
}
return DeclReductions;
}
TypeResult Sema::ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D) {
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType T = TInfo->getType();
if (D.isInvalidType())
return true;
if (getLangOpts().CPlusPlus) {
// Check that there are no default arguments (C++ only).
CheckExtraCXXDefaultArguments(D);
}
return CreateParsedType(T, TInfo);
}
QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
TypeResult ParsedType) {
assert(ParsedType.isUsable() && "Expect usable parsed mapper type");
QualType MapperType = GetTypeFromParser(ParsedType.get());
assert(!MapperType.isNull() && "Expect valid mapper type");
// [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions
// The type must be of struct, union or class type in C and C++
if (!MapperType->isStructureOrClassType() && !MapperType->isUnionType()) {
Diag(TyLoc, diag::err_omp_mapper_wrong_type);
return QualType();
}
return MapperType;
}
OMPDeclareMapperDecl *Sema::ActOnOpenMPDeclareMapperDirectiveStart(
Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
Decl *PrevDeclInScope) {
LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPMapperName,
forRedeclarationInCurContext());
// [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions
// A mapper-identifier may not be redeclared in the current scope for the
// same type or for a type that is compatible according to the base language
// rules.
llvm::DenseMap<QualType, SourceLocation> PreviousRedeclTypes;
OMPDeclareMapperDecl *PrevDMD = nullptr;
bool InCompoundScope = true;
if (S != nullptr) {
// Find previous declaration with the same name not referenced in other
// declarations.
FunctionScopeInfo *ParentFn = getEnclosingFunction();
InCompoundScope =
(ParentFn != nullptr) && !ParentFn->CompoundScopes.empty();
LookupName(Lookup, S);
FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false,
/*AllowInlineNamespace=*/false);
llvm::DenseMap<OMPDeclareMapperDecl *, bool> UsedAsPrevious;
LookupResult::Filter Filter = Lookup.makeFilter();
while (Filter.hasNext()) {
auto *PrevDecl = cast<OMPDeclareMapperDecl>(Filter.next());
if (InCompoundScope) {
auto I = UsedAsPrevious.find(PrevDecl);
if (I == UsedAsPrevious.end())
UsedAsPrevious[PrevDecl] = false;
if (OMPDeclareMapperDecl *D = PrevDecl->getPrevDeclInScope())
UsedAsPrevious[D] = true;
}
PreviousRedeclTypes[PrevDecl->getType().getCanonicalType()] =
PrevDecl->getLocation();
}
Filter.done();
if (InCompoundScope) {
for (const auto &PrevData : UsedAsPrevious) {
if (!PrevData.second) {
PrevDMD = PrevData.first;
break;
}
}
}
} else if (PrevDeclInScope) {
auto *PrevDMDInScope = PrevDMD =
cast<OMPDeclareMapperDecl>(PrevDeclInScope);
do {
PreviousRedeclTypes[PrevDMDInScope->getType().getCanonicalType()] =
PrevDMDInScope->getLocation();
PrevDMDInScope = PrevDMDInScope->getPrevDeclInScope();
} while (PrevDMDInScope != nullptr);
}
const auto I = PreviousRedeclTypes.find(MapperType.getCanonicalType());
bool Invalid = false;
if (I != PreviousRedeclTypes.end()) {
Diag(StartLoc, diag::err_omp_declare_mapper_redefinition)
<< MapperType << Name;
Diag(I->second, diag::note_previous_definition);
Invalid = true;
}
auto *DMD = OMPDeclareMapperDecl::Create(Context, DC, StartLoc, Name,
MapperType, VN, PrevDMD);
DC->addDecl(DMD);
DMD->setAccess(AS);
if (Invalid)
DMD->setInvalidDecl();
// Enter new function scope.
PushFunctionScope();
setFunctionHasBranchProtectedScope();
CurContext = DMD;
return DMD;
}
void Sema::ActOnOpenMPDeclareMapperDirectiveVarDecl(OMPDeclareMapperDecl *DMD,
Scope *S,
QualType MapperType,
SourceLocation StartLoc,
DeclarationName VN) {
VarDecl *VD = buildVarDecl(*this, StartLoc, MapperType, VN.getAsString());
if (S)
PushOnScopeChains(VD, S);
else
DMD->addDecl(VD);
Expr *MapperVarRefExpr = buildDeclRefExpr(*this, VD, MapperType, StartLoc);
DMD->setMapperVarRef(MapperVarRefExpr);
}
Sema::DeclGroupPtrTy
Sema::ActOnOpenMPDeclareMapperDirectiveEnd(OMPDeclareMapperDecl *D, Scope *S,
ArrayRef<OMPClause *> ClauseList) {
PopDeclContext();
PopFunctionScopeInfo();
if (D) {
if (S)
PushOnScopeChains(D, S, /*AddToContext=*/false);
D->CreateClauses(Context, ClauseList);
}
return DeclGroupPtrTy::make(DeclGroupRef(D));
}
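// Illustrative usage (informal sketch, hypothetical names) of the declare
// mapper handling above:
//   struct vec { int len; double *data; };
//   #pragma omp declare mapper(id : struct vec v) map(v.len, v.data[0:v.len])
// The declared variable 'v' becomes the mapper variable built in
// ActOnOpenMPDeclareMapperDirectiveVarDecl, and the map clauses of the
// directive are attached via CreateClauses above.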
OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumTeams;
Stmt *HelperValStmt = nullptr;
// OpenMP [teams Construct, Restrictions]
// The num_teams expression must evaluate to a positive integer value.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_teams,
/*StrictlyPositive=*/true))
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
return new (Context) OMPNumTeamsClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = ThreadLimit;
Stmt *HelperValStmt = nullptr;
// OpenMP [teams Construct, Restrictions]
// The thread_limit expression must evaluate to a positive integer value.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_thread_limit,
/*StrictlyPositive=*/true))
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
DKind, OMPC_thread_limit, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
return new (Context) OMPThreadLimitClause(
ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
}
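// Illustrative sketch (informal): both clauses above require a strictly
// positive value, e.g.
//   #pragma omp teams num_teams(8) thread_limit(64)  // accepted
//   #pragma omp teams num_teams(0)                   // rejected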
OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Priority;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.1, task Construct]
// The priority-value is a non-negative numerical scalar expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_priority,
/*StrictlyPositive=*/false, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPPriorityClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPGrainsizeClause(Expr *Grainsize,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Grainsize;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.2, taskloop Construct]
// The parameter of the grainsize clause must be a positive integer
// expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_grainsize,
/*StrictlyPositive=*/true, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPGrainsizeClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPNumTasksClause(Expr *NumTasks,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumTasks;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.2, taskloop Construct]
// The parameter of the num_tasks clause must be a positive integer
// expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_num_tasks,
/*StrictlyPositive=*/true, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPNumTasksClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
}
OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.13.2, critical construct, Description]
// ... where hint-expression is an integer constant expression that evaluates
// to a valid lock hint.
ExprResult HintExpr = VerifyPositiveIntegerConstantInClause(Hint, OMPC_hint);
if (HintExpr.isInvalid())
return nullptr;
return new (Context)
OMPHintClause(HintExpr.get(), StartLoc, LParenLoc, EndLoc);
}
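// Illustrative sketch (informal): the hint expression must be an integer
// constant expression, e.g.
//   #pragma omp critical (lck) hint(omp_sync_hint_contended)
// A non-constant hint expression is rejected by the check above.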
/// Tries to find the omp_event_handle_t type.
static bool findOMPEventHandleT(Sema &S, SourceLocation Loc,
DSAStackTy *Stack) {
QualType OMPEventHandleT = Stack->getOMPEventHandleT();
if (!OMPEventHandleT.isNull())
return true;
IdentifierInfo *II = &S.PP.getIdentifierTable().get("omp_event_handle_t");
ParsedType PT = S.getTypeName(*II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_event_handle_t";
return false;
}
Stack->setOMPEventHandleT(PT.get());
return true;
}
OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (!Evt->isValueDependent() && !Evt->isTypeDependent() &&
!Evt->isInstantiationDependent() &&
!Evt->containsUnexpandedParameterPack()) {
if (!findOMPEventHandleT(*this, Evt->getExprLoc(), DSAStack))
return nullptr;
// OpenMP 5.0, 2.10.1 task Construct.
// event-handle is a variable of the omp_event_handle_t type.
auto *Ref = dyn_cast<DeclRefExpr>(Evt->IgnoreParenImpCasts());
if (!Ref) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 0 << Evt->getSourceRange();
return nullptr;
}
auto *VD = dyn_cast_or_null<VarDecl>(Ref->getDecl());
if (!VD) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 0 << Evt->getSourceRange();
return nullptr;
}
if (!Context.hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(),
VD->getType()) ||
VD->getType().isConstant(Context)) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 1 << VD->getType()
<< Evt->getSourceRange();
return nullptr;
}
// OpenMP 5.0, 2.10.1 task Construct
// [detach clause]... The event-handle will be considered as if it was
// specified on a firstprivate clause.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_firstprivate &&
DVar.RefExpr) {
Diag(Evt->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, VD, DVar);
return nullptr;
}
}
return new (Context) OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc);
}
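// Illustrative usage (informal sketch) of the detach clause handling above:
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { /* ... */ }
//   omp_fulfill_event(ev);  // completes the detached task (runtime API)
// 'ev' must be a non-const variable of type omp_event_handle_t and must not
// already have a conflicting data-sharing attribute on the task.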
OMPClause *Sema::ActOnOpenMPDistScheduleClause(
OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_DIST_SCHEDULE_unknown) {
std::string Values;
Values += "'";
Values += getOpenMPSimpleClauseTypeName(OMPC_dist_schedule, 0);
Values += "'";
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_dist_schedule);
return nullptr;
}
Expr *ValExpr = ChunkSize;
Stmt *HelperValStmt = nullptr;
if (ChunkSize) {
if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
!ChunkSize->isInstantiationDependent() &&
!ChunkSize->containsUnexpandedParameterPack()) {
SourceLocation ChunkSizeLoc = ChunkSize->getBeginLoc();
ExprResult Val =
PerformOpenMPImplicitIntegerConversion(ChunkSizeLoc, ChunkSize);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
// OpenMP [2.7.1, Restrictions]
// chunk_size must be a loop invariant integer expression with a positive
// value.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, Context)) {
if (Result.isSigned() && !Result.isStrictlyPositive()) {
Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause)
<< "dist_schedule" << ChunkSize->getSourceRange();
return nullptr;
}
} else if (getOpenMPCaptureRegionForClause(
DSAStack->getCurrentDirective(), OMPC_dist_schedule,
LangOpts.OpenMP) != OMPD_unknown &&
!CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
}
}
}
return new (Context)
OMPDistScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc,
Kind, ValExpr, HelperValStmt);
}
OMPClause *Sema::ActOnOpenMPDefaultmapClause(
OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc,
SourceLocation KindLoc, SourceLocation EndLoc) {
if (getLangOpts().OpenMP < 50) {
if (M != OMPC_DEFAULTMAP_MODIFIER_tofrom ||
Kind != OMPC_DEFAULTMAP_scalar) {
std::string Value;
SourceLocation Loc;
Value += "'";
if (M != OMPC_DEFAULTMAP_MODIFIER_tofrom) {
Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
OMPC_DEFAULTMAP_MODIFIER_tofrom);
Loc = MLoc;
} else {
Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
OMPC_DEFAULTMAP_scalar);
Loc = KindLoc;
}
Value += "'";
Diag(Loc, diag::err_omp_unexpected_clause_value)
<< Value << getOpenMPClauseName(OMPC_defaultmap);
return nullptr;
}
} else {
bool isDefaultmapModifier = (M != OMPC_DEFAULTMAP_MODIFIER_unknown);
bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) ||
(LangOpts.OpenMP >= 50 && KindLoc.isInvalid());
if (!isDefaultmapKind || !isDefaultmapModifier) {
std::string ModifierValue = "'alloc', 'from', 'to', 'tofrom', "
"'firstprivate', 'none', 'default'";
std::string KindValue = "'scalar', 'aggregate', 'pointer'";
if (!isDefaultmapKind && isDefaultmapModifier) {
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< KindValue << getOpenMPClauseName(OMPC_defaultmap);
} else if (isDefaultmapKind && !isDefaultmapModifier) {
Diag(MLoc, diag::err_omp_unexpected_clause_value)
<< ModifierValue << getOpenMPClauseName(OMPC_defaultmap);
} else {
Diag(MLoc, diag::err_omp_unexpected_clause_value)
<< ModifierValue << getOpenMPClauseName(OMPC_defaultmap);
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< KindValue << getOpenMPClauseName(OMPC_defaultmap);
}
return nullptr;
}
// OpenMP [5.0, 2.12.5, Restrictions, p. 174]
// At most one defaultmap clause for each category can appear on the
// directive.
if (DSAStack->checkDefaultmapCategory(Kind)) {
Diag(StartLoc, diag::err_omp_one_defaultmap_each_category);
return nullptr;
}
}
if (Kind == OMPC_DEFAULTMAP_unknown) {
// Variable category is not specified - mark all categories.
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_aggregate, StartLoc);
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_scalar, StartLoc);
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_pointer, StartLoc);
} else {
DSAStack->setDefaultDMAAttr(M, Kind, StartLoc);
}
return new (Context)
OMPDefaultmapClause(StartLoc, LParenLoc, MLoc, KindLoc, EndLoc, Kind, M);
}
bool Sema::ActOnStartOpenMPDeclareTargetDirective(SourceLocation Loc) {
DeclContext *CurLexicalContext = getCurLexicalContext();
if (!CurLexicalContext->isFileContext() &&
!CurLexicalContext->isExternCContext() &&
!CurLexicalContext->isExternCXXContext() &&
!isa<CXXRecordDecl>(CurLexicalContext) &&
!isa<ClassTemplateDecl>(CurLexicalContext) &&
!isa<ClassTemplatePartialSpecializationDecl>(CurLexicalContext) &&
!isa<ClassTemplateSpecializationDecl>(CurLexicalContext)) {
Diag(Loc, diag::err_omp_region_not_file_context);
return false;
}
++DeclareTargetNestingLevel;
return true;
}
void Sema::ActOnFinishOpenMPDeclareTargetDirective() {
assert(DeclareTargetNestingLevel > 0 &&
"Unexpected ActOnFinishOpenMPDeclareTargetDirective");
--DeclareTargetNestingLevel;
}
NamedDecl *
Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, CXXScopeSpec &ScopeSpec,
const DeclarationNameInfo &Id,
NamedDeclSetType &SameDirectiveDecls) {
LookupResult Lookup(*this, Id, LookupOrdinaryName);
LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return nullptr;
Lookup.suppressDiagnostics();
if (!Lookup.isSingleResult()) {
VarOrFuncDeclFilterCCC CCC(*this);
if (TypoCorrection Corrected =
CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC,
CTK_ErrorRecovery)) {
diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest)
<< Id.getName());
checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl());
return nullptr;
}
Diag(Id.getLoc(), diag::err_undeclared_var_use) << Id.getName();
return nullptr;
}
NamedDecl *ND = Lookup.getAsSingle<NamedDecl>();
if (!isa<VarDecl>(ND) && !isa<FunctionDecl>(ND) &&
!isa<FunctionTemplateDecl>(ND)) {
Diag(Id.getLoc(), diag::err_omp_invalid_target_decl) << Id.getName();
return nullptr;
}
if (!SameDirectiveDecls.insert(cast<NamedDecl>(ND->getCanonicalDecl())))
Diag(Id.getLoc(), diag::err_omp_declare_target_multiple) << Id.getName();
return ND;
}
void Sema::ActOnOpenMPDeclareTargetName(
NamedDecl *ND, SourceLocation Loc, OMPDeclareTargetDeclAttr::MapTypeTy MT,
OMPDeclareTargetDeclAttr::DevTypeTy DT) {
assert((isa<VarDecl>(ND) || isa<FunctionDecl>(ND) ||
isa<FunctionTemplateDecl>(ND)) &&
"Expected variable, function or function template.");
// Diagnose marking after use, as it may lead to incorrect diagnostics and
// codegen.
if (LangOpts.OpenMP >= 50 &&
(ND->isUsed(/*CheckUsedAttr=*/false) || ND->isReferenced()))
Diag(Loc, diag::warn_omp_declare_target_after_first_use);
Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
OMPDeclareTargetDeclAttr::getDeviceType(cast<ValueDecl>(ND));
if (DevTy.hasValue() && *DevTy != DT) {
Diag(Loc, diag::err_omp_device_type_mismatch)
<< OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(DT)
<< OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(*DevTy);
return;
}
Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(cast<ValueDecl>(ND));
if (!Res) {
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(Context, MT, DT,
SourceRange(Loc, Loc));
ND->addAttr(A);
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPDeclareTarget(ND, A);
checkDeclIsAllowedInOpenMPTarget(nullptr, ND, Loc);
} else if (*Res != MT) {
Diag(Loc, diag::err_omp_declare_target_to_and_link) << ND;
}
}
static void checkDeclInTargetContext(SourceLocation SL, SourceRange SR,
Sema &SemaRef, Decl *D) {
if (!D || !isa<VarDecl>(D))
return;
auto *VD = cast<VarDecl>(D);
Optional<OMPDeclareTargetDeclAttr::MapTypeTy> MapTy =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (SemaRef.LangOpts.OpenMP >= 50 &&
(SemaRef.getCurLambda(/*IgnoreNonLambdaCapturingScope=*/true) ||
SemaRef.getCurBlock() || SemaRef.getCurCapturedRegion()) &&
VD->hasGlobalStorage()) {
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> MapTy =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (!MapTy || *MapTy != OMPDeclareTargetDeclAttr::MT_To) {
// OpenMP 5.0, 2.12.7 declare target Directive, Restrictions
// If a lambda declaration and definition appears between a
// declare target directive and the matching end declare target
// directive, all variables that are captured by the lambda
// expression must also appear in a to clause.
SemaRef.Diag(VD->getLocation(),
diag::err_omp_lambda_capture_in_declare_target_not_to);
SemaRef.Diag(SL, diag::note_var_explicitly_captured_here)
<< VD << 0 << SR;
return;
}
}
if (MapTy.hasValue())
return;
SemaRef.Diag(VD->getLocation(), diag::warn_omp_not_in_target_context);
SemaRef.Diag(SL, diag::note_used_here) << SR;
}
static bool checkValueDeclInTarget(SourceLocation SL, SourceRange SR,
Sema &SemaRef, DSAStackTy *Stack,
ValueDecl *VD) {
return OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD) ||
checkTypeMappable(SL, SR, SemaRef, Stack, VD->getType(),
/*FullCheck=*/false);
}
void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
SourceLocation IdLoc) {
if (!D || D->isInvalidDecl())
return;
SourceRange SR = E ? E->getSourceRange() : D->getSourceRange();
SourceLocation SL = E ? E->getBeginLoc() : D->getLocation();
if (auto *VD = dyn_cast<VarDecl>(D)) {
// Only global variables can be marked as declare target.
if (!VD->isFileVarDecl() && !VD->isStaticLocal() &&
!VD->isStaticDataMember())
return;
// 2.10.6: threadprivate variable cannot appear in a declare target
// directive.
if (DSAStack->isThreadPrivate(VD)) {
Diag(SL, diag::err_omp_threadprivate_in_target);
reportOriginalDsa(*this, DSAStack, VD, DSAStack->getTopDSA(VD, false));
return;
}
}
if (const auto *FTD = dyn_cast<FunctionTemplateDecl>(D))
D = FTD->getTemplatedDecl();
if (auto *FD = dyn_cast<FunctionDecl>(D)) {
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(FD);
if (IdLoc.isValid() && Res && *Res == OMPDeclareTargetDeclAttr::MT_Link) {
Diag(IdLoc, diag::err_omp_function_in_link_clause);
Diag(FD->getLocation(), diag::note_defined_here) << FD;
return;
}
}
if (auto *VD = dyn_cast<ValueDecl>(D)) {
// Problems, if any, with a variable declared with an incomplete type will
// be reported as usual, so there is no need to check it here.
if ((E || !VD->getType()->isIncompleteType()) &&
!checkValueDeclInTarget(SL, SR, *this, DSAStack, VD))
return;
if (!E && !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) {
// Checking declaration inside declare target region.
if (isa<VarDecl>(D) || isa<FunctionDecl>(D) ||
isa<FunctionTemplateDecl>(D)) {
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(
Context, OMPDeclareTargetDeclAttr::MT_To,
OMPDeclareTargetDeclAttr::DT_Any, SourceRange(IdLoc, IdLoc));
D->addAttr(A);
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
}
return;
}
}
if (!E)
return;
checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D);
}
OMPClause *Sema::ActOnOpenMPToClause(ArrayRef<Expr *> VarList,
CXXScopeSpec &MapperIdScopeSpec,
DeclarationNameInfo &MapperId,
const OMPVarListLocTy &Locs,
ArrayRef<Expr *> UnresolvedMappers) {
MappableVarListInfo MVLI(VarList);
checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, Locs.StartLoc,
MapperIdScopeSpec, MapperId, UnresolvedMappers);
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPToClause::Create(
Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
MVLI.VarComponents, MVLI.UDMapperList,
MapperIdScopeSpec.getWithLocInContext(Context), MapperId);
}
OMPClause *Sema::ActOnOpenMPFromClause(ArrayRef<Expr *> VarList,
CXXScopeSpec &MapperIdScopeSpec,
DeclarationNameInfo &MapperId,
const OMPVarListLocTy &Locs,
ArrayRef<Expr *> UnresolvedMappers) {
MappableVarListInfo MVLI(VarList);
checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, Locs.StartLoc,
MapperIdScopeSpec, MapperId, UnresolvedMappers);
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPFromClause::Create(
Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
MVLI.VarComponents, MVLI.UDMapperList,
MapperIdScopeSpec.getWithLocInContext(Context), MapperId);
}
OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
SmallVector<Expr *, 8> PrivateCopies;
SmallVector<Expr *, 8> Inits;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP use_device_ptr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
MVLI.ProcessedVarList.push_back(RefExpr);
PrivateCopies.push_back(nullptr);
Inits.push_back(nullptr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
Type = Type.getNonReferenceType().getUnqualifiedType();
auto *VD = dyn_cast<VarDecl>(D);
// Item should be a pointer or reference to pointer.
if (!Type->isPointerType()) {
Diag(ELoc, diag::err_omp_usedeviceptr_not_a_pointer)
<< 0 << RefExpr->getSourceRange();
continue;
}
// Build the private variable and the expression that refers to it.
auto VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
if (VDPrivate->isInvalidDecl())
continue;
CurContext->addDecl(VDPrivate);
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
// Add temporary variable to initialize the private copy of the pointer.
VarDecl *VDInit =
buildVarDecl(*this, RefExpr->getExprLoc(), Type, ".devptr.temp");
DeclRefExpr *VDInitRefExpr = buildDeclRefExpr(
*this, VDInit, RefExpr->getType(), RefExpr->getExprLoc());
AddInitializerToDecl(VDPrivate,
DefaultLvalueConversion(VDInitRefExpr).get(),
/*DirectInit=*/false);
// If required, build a capture to implement the privatization initialized
// with the current list item value.
DeclRefExpr *Ref = nullptr;
if (!VD)
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref);
PrivateCopies.push_back(VDPrivateRefExpr);
Inits.push_back(VDInitRefExpr);
// We need to add a data sharing attribute for this variable to make sure it
// is correctly captured. A variable that appears in a use_device_ptr clause
// has properties similar to a firstprivate variable.
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
// Create a mappable component for the list item. List items in this clause
// only need a component.
MVLI.VarBaseDeclarations.push_back(D);
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
MVLI.VarComponents.back().push_back(
OMPClauseMappableExprCommon::MappableComponent(SimpleRefExpr, D));
}
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPUseDevicePtrClause::Create(
Context, Locs, MVLI.ProcessedVarList, PrivateCopies, Inits,
MVLI.VarBaseDeclarations, MVLI.VarComponents);
}
OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP use_device_addr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
if (Res.second) {
// It will be analyzed later.
MVLI.ProcessedVarList.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
auto *VD = dyn_cast<VarDecl>(D);
// If required, build a capture to implement the privatization initialized
// with the current list item value.
DeclRefExpr *Ref = nullptr;
if (!VD)
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref);
// We need to add a data sharing attribute for this variable to make sure it
// is correctly captured. A variable that appears in a use_device_addr clause
// has properties similar to a firstprivate variable.
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
// Create a mappable component for the list item. List items in this clause
// only need a component.
MVLI.VarBaseDeclarations.push_back(D);
MVLI.VarComponents.emplace_back();
Expr *Component = SimpleRefExpr;
if (VD && (isa<OMPArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
isa<ArraySubscriptExpr>(RefExpr->IgnoreParenImpCasts())))
Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get();
MVLI.VarComponents.back().push_back(
OMPClauseMappableExprCommon::MappableComponent(Component, D));
}
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPUseDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList,
MVLI.VarBaseDeclarations,
MVLI.VarComponents);
}
OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP is_device_ptr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
MVLI.ProcessedVarList.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
QualType Type = D->getType();
// item should be a pointer or array or reference to pointer or array
if (!Type.getNonReferenceType()->isPointerType() &&
!Type.getNonReferenceType()->isArrayType()) {
Diag(ELoc, diag::err_omp_argument_type_isdeviceptr)
<< 0 << RefExpr->getSourceRange();
continue;
}
// Check that the declaration in the clause does not already have a
// conflicting data-sharing attribute.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (isOpenMPPrivate(DVar.CKind)) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_is_device_ptr)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
reportOriginalDsa(*this, DSAStack, D, DVar);
continue;
}
const Expr *ConflictExpr;
if (DSAStack->checkMappableExprComponentListsForDecl(
D, /*CurrentRegionOnly=*/true,
[&ConflictExpr](
OMPClauseMappableExprCommon::MappableExprComponentListRef R,
OpenMPClauseKind) -> bool {
ConflictExpr = R.front().getAssociatedExpression();
return true;
})) {
Diag(ELoc, diag::err_omp_map_shared_storage) << RefExpr->getSourceRange();
Diag(ConflictExpr->getExprLoc(), diag::note_used_here)
<< ConflictExpr->getSourceRange();
continue;
}
// Store the components in the stack so that they can be used to check
// against other clauses later on.
OMPClauseMappableExprCommon::MappableComponent MC(SimpleRefExpr, D);
DSAStack->addMappableExpressionComponents(
D, MC, /*WhereFoundClauseKind=*/OMPC_is_device_ptr);
// Record the expression we've just processed.
MVLI.ProcessedVarList.push_back(SimpleRefExpr);
// Create a mappable component for the list item. List items in this clause
// only need a component. We use a null declaration to signal fields in
// 'this'.
assert((isa<DeclRefExpr>(SimpleRefExpr) ||
isa<CXXThisExpr>(cast<MemberExpr>(SimpleRefExpr)->getBase())) &&
"Unexpected device pointer expression!");
MVLI.VarBaseDeclarations.push_back(
isa<DeclRefExpr>(SimpleRefExpr) ? D : nullptr);
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
MVLI.VarComponents.back().push_back(MC);
}
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPIsDevicePtrClause::Create(Context, Locs, MVLI.ProcessedVarList,
MVLI.VarBaseDeclarations,
MVLI.VarComponents);
}
OMPClause *Sema::ActOnOpenMPAllocateClause(
Expr *Allocator, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
if (Allocator) {
// OpenMP [2.11.4 allocate Clause, Description]
// allocator is an expression of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, Allocator->getExprLoc(), DSAStack))
return nullptr;
ExprResult AllocatorRes = DefaultLvalueConversion(Allocator);
if (AllocatorRes.isInvalid())
return nullptr;
AllocatorRes = PerformImplicitConversion(AllocatorRes.get(),
DSAStack->getOMPAllocatorHandleT(),
Sema::AA_Initializing,
/*AllowExplicit=*/true);
if (AllocatorRes.isInvalid())
return nullptr;
Allocator = AllocatorRes.get();
} else {
// OpenMP 5.0, 2.11.4 allocate Clause, Restrictions.
// allocate clauses that appear on a target construct or on constructs in a
// target region must specify an allocator expression unless a requires
// directive with the dynamic_allocators clause is present in the same
// compilation unit.
if (LangOpts.OpenMPIsDevice &&
!DSAStack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>())
targetDiag(StartLoc, diag::err_expected_allocator_expression);
}
// Analyze and build list of variables.
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP private clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
Vars.push_back(RefExpr);
}
ValueDecl *D = Res.first;
if (!D)
continue;
auto *VD = dyn_cast<VarDecl>(D);
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
}
if (Vars.empty())
return nullptr;
if (Allocator)
DSAStack->addInnerAllocatorExpr(Allocator);
return OMPAllocateClause::Create(Context, StartLoc, LParenLoc, Allocator,
ColonLoc, EndLoc, Vars);
}
OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second)
// It will be analyzed later.
Vars.push_back(RefExpr);
ValueDecl *D = Res.first;
if (!D)
continue;
// OpenMP 5.0, 2.9.3.1 simd Construct, Restrictions.
// A list-item cannot appear in more than one nontemporal clause.
if (const Expr *PrevRef =
DSAStack->addUniqueNontemporal(D, SimpleRefExpr)) {
Diag(ELoc, diag::err_omp_used_in_clause_twice)
<< 0 << getOpenMPClauseName(OMPC_nontemporal) << ERange;
Diag(PrevRef->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_nontemporal);
continue;
}
Vars.push_back(RefExpr);
}
if (Vars.empty())
return nullptr;
return OMPNontemporalClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars);
}
OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
if (Res.second)
// It will be analyzed later.
Vars.push_back(RefExpr);
ValueDecl *D = Res.first;
if (!D)
continue;
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/true);
// OpenMP 5.0, 2.9.6, scan Directive, Restrictions.
// A list item that appears in the inclusive or exclusive clause must appear
// in a reduction clause with the inscan modifier on the enclosing
// worksharing-loop, worksharing-loop SIMD, or simd construct.
if (DVar.CKind != OMPC_reduction ||
DVar.Modifier != OMPC_REDUCTION_inscan)
Diag(ELoc, diag::err_omp_inclusive_exclusive_not_reduction)
<< RefExpr->getSourceRange();
if (DSAStack->getParentDirective() != OMPD_unknown)
DSAStack->markDeclAsUsedInScanDirective(D);
Vars.push_back(RefExpr);
}
if (Vars.empty())
return nullptr;
return OMPInclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
}
OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
/*AllowArraySection=*/true);
if (Res.second)
// It will be analyzed later.
Vars.push_back(RefExpr);
ValueDecl *D = Res.first;
if (!D)
continue;
OpenMPDirectiveKind ParentDirective = DSAStack->getParentDirective();
DSAStackTy::DSAVarData DVar;
if (ParentDirective != OMPD_unknown)
DVar = DSAStack->getTopDSA(D, /*FromParent=*/true);
// OpenMP 5.0, 2.9.6, scan Directive, Restrictions.
// A list item that appears in the inclusive or exclusive clause must appear
// in a reduction clause with the inscan modifier on the enclosing
// worksharing-loop, worksharing-loop SIMD, or simd construct.
if (ParentDirective == OMPD_unknown || DVar.CKind != OMPC_reduction ||
DVar.Modifier != OMPC_REDUCTION_inscan) {
Diag(ELoc, diag::err_omp_inclusive_exclusive_not_reduction)
<< RefExpr->getSourceRange();
} else {
DSAStack->markDeclAsUsedInScanDirective(D);
}
Vars.push_back(RefExpr);
}
if (Vars.empty())
return nullptr;
return OMPExclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
}
/// Tries to find omp_alloctrait_t type.
static bool findOMPAlloctraitT(Sema &S, SourceLocation Loc, DSAStackTy *Stack) {
QualType OMPAlloctraitT = Stack->getOMPAlloctraitT();
if (!OMPAlloctraitT.isNull())
return true;
IdentifierInfo &II = S.PP.getIdentifierTable().get("omp_alloctrait_t");
ParsedType PT = S.getTypeName(II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_alloctrait_t";
return false;
}
Stack->setOMPAlloctraitT(PT.get());
return true;
}
OMPClause *Sema::ActOnOpenMPUsesAllocatorClause(
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc,
ArrayRef<UsesAllocatorsData> Data) {
// OpenMP [2.12.5, target Construct]
// allocator is an identifier of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, StartLoc, DSAStack))
return nullptr;
// OpenMP [2.12.5, target Construct]
// allocator-traits-array is an identifier of const omp_alloctrait_t * type.
if (llvm::any_of(
Data,
[](const UsesAllocatorsData &D) { return D.AllocatorTraits; }) &&
!findOMPAlloctraitT(*this, StartLoc, DSAStack))
return nullptr;
llvm::SmallSet<CanonicalDeclPtr<Decl>, 4> PredefinedAllocators;
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
StringRef Allocator =
OMPAllocateDeclAttr::ConvertAllocatorTypeTyToStr(AllocatorKind);
DeclarationName AllocatorName = &Context.Idents.get(Allocator);
PredefinedAllocators.insert(LookupSingleName(
TUScope, AllocatorName, StartLoc, Sema::LookupAnyName));
}
SmallVector<OMPUsesAllocatorsClause::Data, 4> NewData;
for (const UsesAllocatorsData &D : Data) {
Expr *AllocatorExpr = nullptr;
// Check allocator expression.
if (D.Allocator->isTypeDependent()) {
AllocatorExpr = D.Allocator;
} else {
// Traits were specified - need to assign new allocator to the specified
// allocator, so it must be an lvalue.
AllocatorExpr = D.Allocator->IgnoreParenImpCasts();
auto *DRE = dyn_cast<DeclRefExpr>(AllocatorExpr);
bool IsPredefinedAllocator = false;
if (DRE)
IsPredefinedAllocator = PredefinedAllocators.count(DRE->getDecl());
if (!DRE ||
!(Context.hasSameUnqualifiedType(
AllocatorExpr->getType(), DSAStack->getOMPAllocatorHandleT()) ||
Context.typesAreCompatible(AllocatorExpr->getType(),
DSAStack->getOMPAllocatorHandleT(),
/*CompareUnqualified=*/true)) ||
(!IsPredefinedAllocator &&
(AllocatorExpr->getType().isConstant(Context) ||
!AllocatorExpr->isLValue()))) {
Diag(D.Allocator->getExprLoc(), diag::err_omp_var_expected)
<< "omp_allocator_handle_t" << (DRE ? 1 : 0)
<< AllocatorExpr->getType() << D.Allocator->getSourceRange();
continue;
}
// OpenMP [2.12.5, target Construct]
// Predefined allocators appearing in a uses_allocators clause cannot have
// traits specified.
if (IsPredefinedAllocator && D.AllocatorTraits) {
Diag(D.AllocatorTraits->getExprLoc(),
diag::err_omp_predefined_allocator_with_traits)
<< D.AllocatorTraits->getSourceRange();
Diag(D.Allocator->getExprLoc(), diag::note_omp_predefined_allocator)
<< cast<NamedDecl>(DRE->getDecl())->getName()
<< D.Allocator->getSourceRange();
continue;
}
// OpenMP [2.12.5, target Construct]
// Non-predefined allocators appearing in a uses_allocators clause must
// have traits specified.
if (!IsPredefinedAllocator && !D.AllocatorTraits) {
Diag(D.Allocator->getExprLoc(),
diag::err_omp_nonpredefined_allocator_without_traits);
continue;
}
// No allocator traits - just convert it to rvalue.
if (!D.AllocatorTraits)
AllocatorExpr = DefaultLvalueConversion(AllocatorExpr).get();
DSAStack->addUsesAllocatorsDecl(
DRE->getDecl(),
IsPredefinedAllocator
? DSAStackTy::UsesAllocatorsDeclKind::PredefinedAllocator
: DSAStackTy::UsesAllocatorsDeclKind::UserDefinedAllocator);
}
Expr *AllocatorTraitsExpr = nullptr;
if (D.AllocatorTraits) {
if (D.AllocatorTraits->isTypeDependent()) {
AllocatorTraitsExpr = D.AllocatorTraits;
} else {
// OpenMP [2.12.5, target Construct]
// Arrays that contain allocator traits that appear in a uses_allocators
// clause must be constant arrays, have constant values and be defined
// in the same scope as the construct in which the clause appears.
AllocatorTraitsExpr = D.AllocatorTraits->IgnoreParenImpCasts();
// Check that traits expr is a constant array.
QualType TraitTy;
if (const ArrayType *Ty =
AllocatorTraitsExpr->getType()->getAsArrayTypeUnsafe())
if (const auto *ConstArrayTy = dyn_cast<ConstantArrayType>(Ty))
TraitTy = ConstArrayTy->getElementType();
if (TraitTy.isNull() ||
!(Context.hasSameUnqualifiedType(TraitTy,
DSAStack->getOMPAlloctraitT()) ||
Context.typesAreCompatible(TraitTy, DSAStack->getOMPAlloctraitT(),
/*CompareUnqualified=*/true))) {
Diag(D.AllocatorTraits->getExprLoc(),
diag::err_omp_expected_array_alloctraits)
<< AllocatorTraitsExpr->getType();
continue;
}
// If the allocator traits are referenced through a standalone variable,
// do not map them by default.
if (auto *DRE = dyn_cast<DeclRefExpr>(AllocatorTraitsExpr))
DSAStack->addUsesAllocatorsDecl(
DRE->getDecl(),
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait);
}
}
OMPUsesAllocatorsClause::Data &NewD = NewData.emplace_back();
NewD.Allocator = AllocatorExpr;
NewD.AllocatorTraits = AllocatorTraitsExpr;
NewD.LParenLoc = D.LParenLoc;
NewD.RParenLoc = D.RParenLoc;
}
return OMPUsesAllocatorsClause::Create(Context, StartLoc, LParenLoc, EndLoc,
NewData);
}
OMPClause *Sema::ActOnOpenMPAffinityClause(
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc,
SourceLocation EndLoc, Expr *Modifier, ArrayRef<Expr *> Locators) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : Locators) {
assert(RefExpr && "NULL expr in OpenMP shared clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr) || RefExpr->isTypeDependent()) {
// It will be analyzed later.
Vars.push_back(RefExpr);
continue;
}
SourceLocation ELoc = RefExpr->getExprLoc();
Expr *SimpleExpr = RefExpr->IgnoreParenImpCasts();
if (!SimpleExpr->isLValue()) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< 1 << 0 << RefExpr->getSourceRange();
continue;
}
ExprResult Res;
{
Sema::TentativeAnalysisScope Trap(*this);
Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr);
}
if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
!isa<OMPArrayShapingExpr>(SimpleExpr)) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< 1 << 0 << RefExpr->getSourceRange();
continue;
}
Vars.push_back(SimpleExpr);
}
return OMPAffinityClause::Create(Context, StartLoc, LParenLoc, ColonLoc,
EndLoc, Modifier, Vars);
}
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 72d826b8bd17..7c439176f3a4 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -1,247 +1,248 @@
//===- Config.h -------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_CONFIG_H
#define LLD_COFF_CONFIG_H
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/CachePruning.h"
#include <cstdint>
#include <map>
#include <set>
#include <string>
namespace lld {
namespace coff {
using llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN;
using llvm::COFF::WindowsSubsystem;
using llvm::StringRef;
class DefinedAbsolute;
class DefinedRelative;
class StringChunk;
class Symbol;
class InputFile;
// Short aliases.
static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64;
static const auto ARM64 = llvm::COFF::IMAGE_FILE_MACHINE_ARM64;
static const auto ARMNT = llvm::COFF::IMAGE_FILE_MACHINE_ARMNT;
static const auto I386 = llvm::COFF::IMAGE_FILE_MACHINE_I386;
// Represents an /export option.
struct Export {
StringRef name; // N in /export:N or /export:E=N
StringRef extName; // E in /export:E=N
Symbol *sym = nullptr;
uint16_t ordinal = 0;
bool noname = false;
bool data = false;
bool isPrivate = false;
bool constant = false;
// If an export is of the form /export:foo=dllname.bar, foo should be
// exported as an alias to bar in the DLL. forwardTo is set to the
// "dllname.bar" part. Usually empty.
StringRef forwardTo;
StringChunk *forwardChunk = nullptr;
// True if this /export option was in a .drectve section.
bool directives = false;
StringRef symbolName;
StringRef exportName; // Name in DLL
bool operator==(const Export &e) {
return (name == e.name && extName == e.extName &&
ordinal == e.ordinal && noname == e.noname &&
data == e.data && isPrivate == e.isPrivate);
}
};
enum class DebugType {
None = 0x0,
CV = 0x1, /// CodeView
PData = 0x2, /// Procedure Data
Fixup = 0x4, /// Relocation Table
};
enum class GuardCFLevel {
Off,
NoLongJmp, // Emit gfids but no longjmp tables
Full, // Enable all protections.
};
// Global configuration.
struct Configuration {
enum ManifestKind { SideBySide, Embed, No };
bool is64() { return machine == AMD64 || machine == ARM64; }
llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN;
size_t wordsize;
bool verbose = false;
WindowsSubsystem subsystem = llvm::COFF::IMAGE_SUBSYSTEM_UNKNOWN;
Symbol *entry = nullptr;
bool noEntry = false;
std::string outputFile;
std::string importName;
bool demangle = true;
bool doGC = true;
bool doICF = true;
bool tailMerge;
bool relocatable = true;
bool forceMultiple = false;
bool forceMultipleRes = false;
bool forceUnresolved = false;
bool debug = false;
bool debugDwarf = false;
bool debugGHashes = false;
bool debugSymtab = false;
bool driver = false;
bool driverUponly = false;
bool driverWdm = false;
bool showTiming = false;
bool showSummary = false;
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
std::vector<std::string> natvisFiles;
llvm::StringMap<std::string> namedStreams;
llvm::SmallString<128> pdbAltPath;
llvm::SmallString<128> pdbPath;
llvm::SmallString<128> pdbSourcePath;
std::vector<llvm::StringRef> argv;
// Symbols in this set are considered live by the garbage collector.
std::vector<Symbol *> gcroot;
std::set<std::string> noDefaultLibs;
bool noDefaultLibAll = false;
// True if we are creating a DLL.
bool dll = false;
StringRef implib;
std::vector<Export> exports;
bool hadExplicitExports;
std::set<std::string> delayLoads;
std::map<std::string, int> dllOrder;
Symbol *delayLoadHelper = nullptr;
bool saveTemps = false;
// /guard:cf
GuardCFLevel guardCF = GuardCFLevel::Off;
// Used for SafeSEH.
bool safeSEH = false;
Symbol *sehTable = nullptr;
Symbol *sehCount = nullptr;
+ bool noSEH = false;
// Used for /opt:lldlto=N
unsigned ltoo = 2;
// Used for /opt:lldltojobs=N
std::string thinLTOJobs;
// Used for /opt:lldltopartitions=N
unsigned ltoPartitions = 1;
// Used for /opt:lldltocache=path
StringRef ltoCache;
// Used for /opt:lldltocachepolicy=policy
llvm::CachePruningPolicy ltoCachePolicy;
// Used for /merge:from=to (e.g. /merge:.rdata=.text)
std::map<StringRef, StringRef> merge;
// Used for /section=.name,{DEKPRSW} to set section attributes.
std::map<StringRef, uint32_t> section;
// Options for manifest files.
ManifestKind manifest = No;
int manifestID = 1;
StringRef manifestDependency;
bool manifestUAC = true;
std::vector<std::string> manifestInput;
StringRef manifestLevel = "'asInvoker'";
StringRef manifestUIAccess = "'false'";
StringRef manifestFile;
// Used for /aligncomm.
std::map<std::string, int> alignComm;
// Used for /failifmismatch.
std::map<StringRef, std::pair<StringRef, InputFile *>> mustMatch;
// Used for /alternatename.
std::map<StringRef, StringRef> alternateNames;
// Used for /order.
llvm::StringMap<int> order;
// Used for /lldmap.
std::string lldmapFile;
// Used for /map.
std::string mapFile;
// Used for /thinlto-index-only:
llvm::StringRef thinLTOIndexOnlyArg;
// Used for /thinlto-object-prefix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
// Used for /thinlto-object-suffix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
// Used for /lto-obj-path:
llvm::StringRef ltoObjPath;
uint64_t align = 4096;
uint64_t imageBase = -1;
uint64_t fileAlign = 512;
uint64_t stackReserve = 1024 * 1024;
uint64_t stackCommit = 4096;
uint64_t heapReserve = 1024 * 1024;
uint64_t heapCommit = 4096;
uint32_t majorImageVersion = 0;
uint32_t minorImageVersion = 0;
uint32_t majorOSVersion = 6;
uint32_t minorOSVersion = 0;
uint32_t timestamp = 0;
uint32_t functionPadMin = 0;
bool dynamicBase = true;
bool allowBind = true;
bool cetCompat = false;
bool nxCompat = true;
bool allowIsolation = true;
bool terminalServerAware = true;
bool largeAddressAware = false;
bool highEntropyVA = false;
bool appContainer = false;
bool mingw = false;
bool warnMissingOrderSymbol = true;
bool warnLocallyDefinedImported = true;
bool warnDebugInfoUnusable = true;
bool warnLongSectionNames = true;
bool incremental = true;
bool integrityCheck = false;
bool killAt = false;
bool repro = false;
bool swaprunCD = false;
bool swaprunNet = false;
bool thinLTOEmitImportsFiles;
bool thinLTOIndexOnly;
bool autoImport = false;
bool pseudoRelocs = false;
};
extern Configuration *config;
} // namespace coff
} // namespace lld
#endif
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 7372505bb616..9ceccef86779 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1,2052 +1,2053 @@
//===- Driver.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Driver.h"
#include "Config.h"
#include "DebugTypes.h"
#include "ICF.h"
#include "InputFiles.h"
#include "MarkLive.h"
#include "MinGW.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "Writer.h"
#include "lld/Common/Args.h"
#include "lld/Common/Driver.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Filesystem.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Timer.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include <algorithm>
#include <future>
#include <memory>
using namespace llvm;
using namespace llvm::object;
using namespace llvm::COFF;
using llvm::sys::Process;
namespace lld {
namespace coff {
static Timer inputFileTimer("Input File Reading", Timer::root());
Configuration *config;
LinkerDriver *driver;
bool link(ArrayRef<const char *> args, bool canExitEarly, raw_ostream &stdoutOS,
raw_ostream &stderrOS) {
lld::stdoutOS = &stdoutOS;
lld::stderrOS = &stderrOS;
errorHandler().logName = args::getFilenameWithoutExe(args[0]);
errorHandler().errorLimitExceededMsg =
"too many errors emitted, stopping now"
" (use /errorlimit:0 to see all errors)";
errorHandler().exitEarly = canExitEarly;
stderrOS.enable_colors(stderrOS.has_colors());
config = make<Configuration>();
symtab = make<SymbolTable>();
driver = make<LinkerDriver>();
driver->link(args);
// Call exit() if we can to avoid calling destructors.
if (canExitEarly)
exitLld(errorCount() ? 1 : 0);
freeArena();
ObjFile::instances.clear();
ImportFile::instances.clear();
BitcodeFile::instances.clear();
memset(MergeChunk::instances, 0, sizeof(MergeChunk::instances));
TpiSource::clear();
return !errorCount();
}
// Parse options of the form "old;new".
static std::pair<StringRef, StringRef> getOldNewOptions(opt::InputArgList &args,
unsigned id) {
auto *arg = args.getLastArg(id);
if (!arg)
return {"", ""};
StringRef s = arg->getValue();
std::pair<StringRef, StringRef> ret = s.split(';');
if (ret.second.empty())
error(arg->getSpelling() + " expects 'old;new' format, but got " + s);
return ret;
}
// Drop directory components and replace extension with
// ".exe", ".dll" or ".sys".
static std::string getOutputPath(StringRef path) {
StringRef ext = ".exe";
if (config->dll)
ext = ".dll";
else if (config->driver)
ext = ".sys";
return (sys::path::stem(path) + ext).str();
}
// Returns true if S matches /crtend.?\.o$/.
static bool isCrtend(StringRef s) {
if (!s.endswith(".o"))
return false;
s = s.drop_back(2);
if (s.endswith("crtend"))
return true;
return !s.empty() && s.drop_back().endswith("crtend");
}
// ErrorOr is not default constructible, so it cannot be used as the type
// parameter of a future.
// FIXME: We could open the file in createFutureForFile and avoid needing to
// return an error here, but for the moment that would cost us a file descriptor
// (a limited resource on Windows) for the duration that the future is pending.
using MBErrPair = std::pair<std::unique_ptr<MemoryBuffer>, std::error_code>;
// Create a std::future that opens and maps a file using the best strategy for
// the host platform.
static std::future<MBErrPair> createFutureForFile(std::string path) {
#if _WIN32
// On Windows, file I/O is relatively slow so it is best to do this
// asynchronously.
auto strategy = std::launch::async;
#else
auto strategy = std::launch::deferred;
#endif
return std::async(strategy, [=]() {
auto mbOrErr = MemoryBuffer::getFile(path,
/*FileSize*/ -1,
/*RequiresNullTerminator*/ false);
if (!mbOrErr)
return MBErrPair{nullptr, mbOrErr.getError()};
return MBErrPair{std::move(*mbOrErr), std::error_code()};
});
}
// Symbol names are mangled by prepending "_" on x86.
static StringRef mangle(StringRef sym) {
assert(config->machine != IMAGE_FILE_MACHINE_UNKNOWN);
if (config->machine == I386)
return saver.save("_" + sym);
return sym;
}
static bool findUnderscoreMangle(StringRef sym) {
Symbol *s = symtab->findMangle(mangle(sym));
return s && !isa<Undefined>(s);
}
MemoryBufferRef LinkerDriver::takeBuffer(std::unique_ptr<MemoryBuffer> mb) {
MemoryBufferRef mbref = *mb;
make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take ownership
if (driver->tar)
driver->tar->append(relativeToRoot(mbref.getBufferIdentifier()),
mbref.getBuffer());
return mbref;
}
void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
bool wholeArchive, bool lazy) {
StringRef filename = mb->getBufferIdentifier();
MemoryBufferRef mbref = takeBuffer(std::move(mb));
filePaths.push_back(filename);
// File type is detected by contents, not by file extension.
switch (identify_magic(mbref.getBuffer())) {
case file_magic::windows_resource:
resources.push_back(mbref);
break;
case file_magic::archive:
if (wholeArchive) {
std::unique_ptr<Archive> file =
CHECK(Archive::create(mbref), filename + ": failed to parse archive");
Archive *archive = file.get();
make<std::unique_ptr<Archive>>(std::move(file)); // take ownership
int memberIndex = 0;
for (MemoryBufferRef m : getArchiveMembers(archive))
addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
return;
}
symtab->addFile(make<ArchiveFile>(mbref));
break;
case file_magic::bitcode:
if (lazy)
symtab->addFile(make<LazyObjFile>(mbref));
else
symtab->addFile(make<BitcodeFile>(mbref, "", 0));
break;
case file_magic::coff_object:
case file_magic::coff_import_library:
if (lazy)
symtab->addFile(make<LazyObjFile>(mbref));
else
symtab->addFile(make<ObjFile>(mbref));
break;
case file_magic::pdb:
symtab->addFile(make<PDBInputFile>(mbref));
break;
case file_magic::coff_cl_gl_object:
error(filename + ": is not a native COFF file. Recompile without /GL");
break;
case file_magic::pecoff_executable:
if (filename.endswith_lower(".dll")) {
error(filename + ": bad file type. Did you specify a DLL instead of an "
"import library?");
break;
}
LLVM_FALLTHROUGH;
default:
error(mbref.getBufferIdentifier() + ": unknown file type");
break;
}
}
void LinkerDriver::enqueuePath(StringRef path, bool wholeArchive, bool lazy) {
auto future = std::make_shared<std::future<MBErrPair>>(
createFutureForFile(std::string(path)));
std::string pathStr = std::string(path);
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second) {
std::string msg =
"could not open '" + pathStr + "': " + mbOrErr.second.message();
// Check if the filename is a typo for an option flag. OptTable thinks
// that all args that are not known options and that start with / are
// filenames, but e.g. `/nodefaultlibs` is more likely a typo for
// the option `/nodefaultlib` than a reference to a file in the root
// directory.
std::string nearest;
if (optTable.findNearest(pathStr, nearest) > 1)
error(msg);
else
error(msg + "; did you mean '" + nearest + "'");
} else
driver->addBuffer(std::move(mbOrErr.first), wholeArchive, lazy);
});
}
void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
StringRef parentName,
uint64_t offsetInArchive) {
file_magic magic = identify_magic(mb.getBuffer());
if (magic == file_magic::coff_import_library) {
InputFile *imp = make<ImportFile>(mb);
imp->parentName = parentName;
symtab->addFile(imp);
return;
}
InputFile *obj;
if (magic == file_magic::coff_object) {
obj = make<ObjFile>(mb);
} else if (magic == file_magic::bitcode) {
obj = make<BitcodeFile>(mb, parentName, offsetInArchive);
} else {
error("unknown file type: " + mb.getBufferIdentifier());
return;
}
obj->parentName = parentName;
symtab->addFile(obj);
log("Loaded " + toString(obj) + " for " + symName);
}
void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
const Archive::Symbol &sym,
StringRef parentName) {
auto reportBufferError = [=](Error &&e, StringRef childName) {
fatal("could not get the buffer for the member defining symbol " +
toCOFFString(sym) + ": " + parentName + "(" + childName + "): " +
toString(std::move(e)));
};
if (!c.getParent()->isThin()) {
uint64_t offsetInArchive = c.getChildOffset();
Expected<MemoryBufferRef> mbOrErr = c.getMemoryBufferRef();
if (!mbOrErr)
reportBufferError(mbOrErr.takeError(), check(c.getFullName()));
MemoryBufferRef mb = mbOrErr.get();
enqueueTask([=]() {
driver->addArchiveBuffer(mb, toCOFFString(sym), parentName,
offsetInArchive);
});
return;
}
std::string childName = CHECK(
c.getFullName(),
"could not get the filename for the member defining symbol " +
toCOFFString(sym));
auto future = std::make_shared<std::future<MBErrPair>>(
createFutureForFile(childName));
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second)
reportBufferError(errorCodeToError(mbOrErr.second), childName);
// Pass empty string as archive name so that the original filename is
// used as the buffer identifier.
driver->addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
toCOFFString(sym), "", /*OffsetInArchive=*/0);
});
}
static bool isDecorated(StringRef sym) {
return sym.startswith("@") || sym.contains("@@") || sym.startswith("?") ||
(!config->mingw && sym.contains('@'));
}
// Parses .drectve section contents and returns a list of files
// specified by /defaultlib.
void LinkerDriver::parseDirectives(InputFile *file) {
StringRef s = file->getDirectives();
if (s.empty())
return;
log("Directives: " + toString(file) + ": " + s);
ArgParser parser;
// .drectve is always tokenized using Windows shell rules.
// The /EXPORT: option can appear many times, so it is processed on a fast path.
ParsedDirectives directives = parser.parseDirectives(s);
for (StringRef e : directives.exports) {
// If a common header file contains dllexported function
// declarations, many object files may end up having the same
// /EXPORT options. To save the cost of parsing them, we dedup
// them first.
if (!directivesExports.insert(e).second)
continue;
Export exp = parseExport(e);
if (config->machine == I386 && config->mingw) {
if (!isDecorated(exp.name))
exp.name = saver.save("_" + exp.name);
if (!exp.extName.empty() && !isDecorated(exp.extName))
exp.extName = saver.save("_" + exp.extName);
}
exp.directives = true;
config->exports.push_back(exp);
}
// Handle /include: in bulk.
for (StringRef inc : directives.includes)
addUndefined(inc);
for (auto *arg : directives.args) {
switch (arg->getOption().getID()) {
case OPT_aligncomm:
parseAligncomm(arg->getValue());
break;
case OPT_alternatename:
parseAlternateName(arg->getValue());
break;
case OPT_defaultlib:
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false, false);
break;
case OPT_entry:
config->entry = addUndefined(mangle(arg->getValue()));
break;
case OPT_failifmismatch:
checkFailIfMismatch(arg->getValue(), file);
break;
case OPT_incl:
addUndefined(arg->getValue());
break;
case OPT_merge:
parseMerge(arg->getValue());
break;
case OPT_nodefaultlib:
config->noDefaultLibs.insert(doFindLib(arg->getValue()).lower());
break;
case OPT_section:
parseSection(arg->getValue());
break;
case OPT_subsystem:
parseSubsystem(arg->getValue(), &config->subsystem,
&config->majorOSVersion, &config->minorOSVersion);
break;
// Only add flags here that link.exe accepts in
// `#pragma comment(linker, "/flag")`-generated sections.
case OPT_editandcontinue:
case OPT_guardsym:
case OPT_throwingnew:
break;
default:
error(arg->getSpelling() + " is not allowed in .drectve");
}
}
}
// Find a file from the search paths. You can omit ".obj"; this function
// takes care of that. Note that the returned path is not guaranteed to exist.
StringRef LinkerDriver::doFindFile(StringRef filename) {
bool hasPathSep = (filename.find_first_of("/\\") != StringRef::npos);
if (hasPathSep)
return filename;
bool hasExt = filename.contains('.');
for (StringRef dir : searchPaths) {
SmallString<128> path = dir;
sys::path::append(path, filename);
if (sys::fs::exists(path.str()))
return saver.save(path.str());
if (!hasExt) {
path.append(".obj");
if (sys::fs::exists(path.str()))
return saver.save(path.str());
}
}
return filename;
}
static Optional<sys::fs::UniqueID> getUniqueID(StringRef path) {
sys::fs::UniqueID ret;
if (sys::fs::getUniqueID(path, ret))
return None;
return ret;
}
// Resolves a file path. This never returns the same path twice
// (in that case, it returns None).
Optional<StringRef> LinkerDriver::findFile(StringRef filename) {
StringRef path = doFindFile(filename);
if (Optional<sys::fs::UniqueID> id = getUniqueID(path)) {
bool seen = !visitedFiles.insert(*id).second;
if (seen)
return None;
}
if (path.endswith_lower(".lib"))
visitedLibs.insert(std::string(sys::path::filename(path)));
return path;
}
// MinGW specific. If an embedded directive specifies linking to foo.lib
// but it isn't found, try libfoo.a instead.
StringRef LinkerDriver::doFindLibMinGW(StringRef filename) {
if (filename.contains('/') || filename.contains('\\'))
return filename;
SmallString<128> s = filename;
sys::path::replace_extension(s, ".a");
StringRef libName = saver.save("lib" + s.str());
return doFindFile(libName);
}
// Find library file from search path.
StringRef LinkerDriver::doFindLib(StringRef filename) {
// Add ".lib" to Filename if that has no file extension.
bool hasExt = filename.contains('.');
if (!hasExt)
filename = saver.save(filename + ".lib");
StringRef ret = doFindFile(filename);
// For MinGW, if the find above didn't turn up anything, try
// looking for a MinGW formatted library name.
if (config->mingw && ret == filename)
return doFindLibMinGW(filename);
return ret;
}
// Resolves a library path. /nodefaultlib options are taken into
// consideration. This never returns the same path twice (in that case,
// it returns None).
Optional<StringRef> LinkerDriver::findLib(StringRef filename) {
if (config->noDefaultLibAll)
return None;
if (!visitedLibs.insert(filename.lower()).second)
return None;
StringRef path = doFindLib(filename);
if (config->noDefaultLibs.count(path.lower()))
return None;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
if (!visitedFiles.insert(*id).second)
return None;
return path;
}
// Parses LIB environment which contains a list of search paths.
void LinkerDriver::addLibSearchPaths() {
Optional<std::string> envOpt = Process::GetEnv("LIB");
if (!envOpt.hasValue())
return;
StringRef env = saver.save(*envOpt);
while (!env.empty()) {
StringRef path;
std::tie(path, env) = env.split(';');
searchPaths.push_back(path);
}
}
Symbol *LinkerDriver::addUndefined(StringRef name) {
Symbol *b = symtab->addUndefined(name);
if (!b->isGCRoot) {
b->isGCRoot = true;
config->gcroot.push_back(b);
}
return b;
}
StringRef LinkerDriver::mangleMaybe(Symbol *s) {
// If the plain symbol name has already been resolved, do nothing.
Undefined *unmangled = dyn_cast<Undefined>(s);
if (!unmangled)
return "";
// Otherwise, see if a similar, mangled symbol exists in the symbol table.
Symbol *mangled = symtab->findMangle(unmangled->getName());
if (!mangled)
return "";
// If we find a similar mangled symbol, make this an alias to it and return
// its name.
log(unmangled->getName() + " aliased to " + mangled->getName());
unmangled->weakAlias = symtab->addUndefined(mangled->getName());
return mangled->getName();
}
// Windows specific -- find default entry point name.
//
// There are four different entry point functions for Windows executables,
// each of which corresponds to a user-defined "main" function. This function
// infers an entry point from a user-defined "main" function.
StringRef LinkerDriver::findDefaultEntry() {
assert(config->subsystem != IMAGE_SUBSYSTEM_UNKNOWN &&
"must handle /subsystem before calling this");
if (config->mingw)
return mangle(config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI
? "WinMainCRTStartup"
: "mainCRTStartup");
if (config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) {
if (findUnderscoreMangle("wWinMain")) {
if (!findUnderscoreMangle("WinMain"))
return mangle("wWinMainCRTStartup");
warn("found both wWinMain and WinMain; using latter");
}
return mangle("WinMainCRTStartup");
}
if (findUnderscoreMangle("wmain")) {
if (!findUnderscoreMangle("main"))
return mangle("wmainCRTStartup");
warn("found both wmain and main; using latter");
}
return mangle("mainCRTStartup");
}
WindowsSubsystem LinkerDriver::inferSubsystem() {
if (config->dll)
return IMAGE_SUBSYSTEM_WINDOWS_GUI;
if (config->mingw)
return IMAGE_SUBSYSTEM_WINDOWS_CUI;
// Note that link.exe infers the subsystem from the presence of these
// functions even if /entry: or /nodefaultlib are passed, which causes them
// not to be called.
bool haveMain = findUnderscoreMangle("main");
bool haveWMain = findUnderscoreMangle("wmain");
bool haveWinMain = findUnderscoreMangle("WinMain");
bool haveWWinMain = findUnderscoreMangle("wWinMain");
if (haveMain || haveWMain) {
if (haveWinMain || haveWWinMain) {
warn(std::string("found ") + (haveMain ? "main" : "wmain") + " and " +
(haveWinMain ? "WinMain" : "wWinMain") +
"; defaulting to /subsystem:console");
}
return IMAGE_SUBSYSTEM_WINDOWS_CUI;
}
if (haveWinMain || haveWWinMain)
return IMAGE_SUBSYSTEM_WINDOWS_GUI;
return IMAGE_SUBSYSTEM_UNKNOWN;
}
static uint64_t getDefaultImageBase() {
if (config->is64())
return config->dll ? 0x180000000 : 0x140000000;
return config->dll ? 0x10000000 : 0x400000;
}
static std::string createResponseFile(const opt::InputArgList &args,
ArrayRef<StringRef> filePaths,
ArrayRef<StringRef> searchPaths) {
SmallString<0> data;
raw_svector_ostream os(data);
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_linkrepro:
case OPT_reproduce:
case OPT_INPUT:
case OPT_defaultlib:
case OPT_libpath:
case OPT_manifest:
case OPT_manifest_colon:
case OPT_manifestdependency:
case OPT_manifestfile:
case OPT_manifestinput:
case OPT_manifestuac:
break;
case OPT_implib:
case OPT_pdb:
case OPT_pdbstripped:
case OPT_out:
os << arg->getSpelling() << sys::path::filename(arg->getValue()) << "\n";
break;
default:
os << toString(*arg) << "\n";
}
}
for (StringRef path : searchPaths) {
std::string relPath = relativeToRoot(path);
os << "/libpath:" << quote(relPath) << "\n";
}
for (StringRef path : filePaths)
os << quote(relativeToRoot(path)) << "\n";
return std::string(data.str());
}
enum class DebugKind { Unknown, None, Full, FastLink, GHash, Dwarf, Symtab };
static DebugKind parseDebugKind(const opt::InputArgList &args) {
auto *a = args.getLastArg(OPT_debug, OPT_debug_opt);
if (!a)
return DebugKind::None;
if (a->getNumValues() == 0)
return DebugKind::Full;
DebugKind debug = StringSwitch<DebugKind>(a->getValue())
.CaseLower("none", DebugKind::None)
.CaseLower("full", DebugKind::Full)
.CaseLower("fastlink", DebugKind::FastLink)
// LLD extensions
.CaseLower("ghash", DebugKind::GHash)
.CaseLower("dwarf", DebugKind::Dwarf)
.CaseLower("symtab", DebugKind::Symtab)
.Default(DebugKind::Unknown);
if (debug == DebugKind::FastLink) {
warn("/debug:fastlink unsupported; using /debug:full");
return DebugKind::Full;
}
if (debug == DebugKind::Unknown) {
error("/debug: unknown option: " + Twine(a->getValue()));
return DebugKind::None;
}
return debug;
}
static unsigned parseDebugTypes(const opt::InputArgList &args) {
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
if (auto *a = args.getLastArg(OPT_debugtype)) {
SmallVector<StringRef, 3> types;
StringRef(a->getValue())
.split(types, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
for (StringRef type : types) {
unsigned v = StringSwitch<unsigned>(type.lower())
.Case("cv", static_cast<unsigned>(DebugType::CV))
.Case("pdata", static_cast<unsigned>(DebugType::PData))
.Case("fixup", static_cast<unsigned>(DebugType::Fixup))
.Default(0);
if (v == 0) {
warn("/debugtype: unknown option '" + type + "'");
continue;
}
debugTypes |= v;
}
return debugTypes;
}
// Default debug types
debugTypes = static_cast<unsigned>(DebugType::CV);
if (args.hasArg(OPT_driver))
debugTypes |= static_cast<unsigned>(DebugType::PData);
if (args.hasArg(OPT_profile))
debugTypes |= static_cast<unsigned>(DebugType::Fixup);
return debugTypes;
}
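// Examples of the resulting mask (hypothetical command lines):
//   /debugtype:cv,pdata  -> CV | PData
//   /debugtype:cv,bogus  -> CV, plus a warning about "bogus"
//   (no /debugtype)      -> CV, plus PData with /driver and Fixup with /profile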
static std::string getMapFile(const opt::InputArgList &args,
opt::OptSpecifier os, opt::OptSpecifier osFile) {
auto *arg = args.getLastArg(os, osFile);
if (!arg)
return "";
if (arg->getOption().getID() == osFile.getID())
return arg->getValue();
assert(arg->getOption().getID() == os.getID());
StringRef outFile = config->outputFile;
return (outFile.substr(0, outFile.rfind('.')) + ".map").str();
}
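// Examples (hypothetical): with /out:foo.exe, "/map" or "/lldmap" alone derive
// "foo.map" from the output name, while "/map:bar.map" or "/lldmap:bar.map"
// use the given file name directly.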
static std::string getImplibPath() {
if (!config->implib.empty())
return std::string(config->implib);
SmallString<128> out = StringRef(config->outputFile);
sys::path::replace_extension(out, ".lib");
return std::string(out.str());
}
// The import name is calculated as follows:
//
//        | LIBRARY w/ ext | LIBRARY w/o ext     | no LIBRARY
//   -----+----------------+---------------------+------------------
//   LINK | {value}        | {value}.{.dll/.exe} | {output name}
//    LIB | {value}        | {value}.dll         | {output name}.dll
//
static std::string getImportName(bool asLib) {
SmallString<128> out;
if (config->importName.empty()) {
out.assign(sys::path::filename(config->outputFile));
if (asLib)
sys::path::replace_extension(out, ".dll");
} else {
out.assign(config->importName);
if (!sys::path::has_extension(out))
sys::path::replace_extension(out,
(config->dll || asLib) ? ".dll" : ".exe");
}
return std::string(out.str());
}
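// Examples of the table above (hypothetical inputs):
//   /out:foo.exe, no LIBRARY, normal link        -> "foo.exe"
//   /out:foo.exe, no LIBRARY, lib.exe mode       -> "foo.dll"
//   LIBRARY bar (no extension) for a /dll link   -> "bar.dll"
//   LIBRARY bar.ocx                              -> "bar.ocx"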
static void createImportLibrary(bool asLib) {
std::vector<COFFShortExport> exports;
for (Export &e1 : config->exports) {
COFFShortExport e2;
e2.Name = std::string(e1.name);
e2.SymbolName = std::string(e1.symbolName);
e2.ExtName = std::string(e1.extName);
e2.Ordinal = e1.ordinal;
e2.Noname = e1.noname;
e2.Data = e1.data;
e2.Private = e1.isPrivate;
e2.Constant = e1.constant;
exports.push_back(e2);
}
auto handleError = [](Error &&e) {
handleAllErrors(std::move(e),
[](ErrorInfoBase &eib) { error(eib.message()); });
};
std::string libName = getImportName(asLib);
std::string path = getImplibPath();
if (!config->incremental) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
config->mingw));
return;
}
// If the import library already exists, replace it only if the contents
// have changed.
ErrorOr<std::unique_ptr<MemoryBuffer>> oldBuf = MemoryBuffer::getFile(
path, /*FileSize*/ -1, /*RequiresNullTerminator*/ false);
if (!oldBuf) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
config->mingw));
return;
}
SmallString<128> tmpName;
if (std::error_code ec =
sys::fs::createUniqueFile(path + ".tmp-%%%%%%%%.lib", tmpName))
fatal("cannot create temporary file for import library " + path + ": " +
ec.message());
if (Error e = writeImportLibrary(libName, tmpName, exports, config->machine,
config->mingw)) {
handleError(std::move(e));
return;
}
std::unique_ptr<MemoryBuffer> newBuf = check(MemoryBuffer::getFile(
tmpName, /*FileSize*/ -1, /*RequiresNullTerminator*/ false));
if ((*oldBuf)->getBuffer() != newBuf->getBuffer()) {
oldBuf->reset();
handleError(errorCodeToError(sys::fs::rename(tmpName, path)));
} else {
sys::fs::remove(tmpName);
}
}
static void parseModuleDefs(StringRef path) {
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
COFFModuleDefinition m = check(parseCOFFModuleDefinition(
mb->getMemBufferRef(), config->machine, config->mingw));
if (config->outputFile.empty())
config->outputFile = std::string(saver.save(m.OutputFile));
config->importName = std::string(saver.save(m.ImportName));
if (m.ImageBase)
config->imageBase = m.ImageBase;
if (m.StackReserve)
config->stackReserve = m.StackReserve;
if (m.StackCommit)
config->stackCommit = m.StackCommit;
if (m.HeapReserve)
config->heapReserve = m.HeapReserve;
if (m.HeapCommit)
config->heapCommit = m.HeapCommit;
if (m.MajorImageVersion)
config->majorImageVersion = m.MajorImageVersion;
if (m.MinorImageVersion)
config->minorImageVersion = m.MinorImageVersion;
if (m.MajorOSVersion)
config->majorOSVersion = m.MajorOSVersion;
if (m.MinorOSVersion)
config->minorOSVersion = m.MinorOSVersion;
for (COFFShortExport e1 : m.Exports) {
Export e2;
// In simple cases, only Name is set. Renamed exports are parsed
// and set as "ExtName = Name". If Name has the form "OtherDll.Func",
// it shouldn't be a normal exported function but a forward to another
// DLL instead. This is supported by both MS and GNU linkers.
if (!e1.ExtName.empty() && e1.ExtName != e1.Name &&
StringRef(e1.Name).contains('.')) {
e2.name = saver.save(e1.ExtName);
e2.forwardTo = saver.save(e1.Name);
config->exports.push_back(e2);
continue;
}
e2.name = saver.save(e1.Name);
e2.extName = saver.save(e1.ExtName);
e2.ordinal = e1.Ordinal;
e2.noname = e1.Noname;
e2.data = e1.Data;
e2.isPrivate = e1.Private;
e2.constant = e1.Constant;
config->exports.push_back(e2);
}
}
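// Illustrative .def input and how the loop above classifies its EXPORTS
// entries (names are hypothetical):
//   EXPORTS
//     Plain                -> ordinary export of the symbol "Plain"
//     Fwd = OtherDll.Func  -> forward: name = "Fwd", forwardTo = "OtherDll.Func"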
void LinkerDriver::enqueueTask(std::function<void()> task) {
taskQueue.push_back(std::move(task));
}
bool LinkerDriver::run() {
ScopedTimer t(inputFileTimer);
bool didWork = !taskQueue.empty();
while (!taskQueue.empty()) {
taskQueue.front()();
taskQueue.pop_front();
}
return didWork;
}
// Parse an /order file. If an option is given, the linker places
// COMDAT sections in the same order as their names appear in the
// given file.
static void parseOrderFile(StringRef arg) {
// For some reason, the MSVC linker requires a filename to be
// preceded by "@".
if (!arg.startswith("@")) {
error("malformed /order option: '@' missing");
return;
}
// Get a list of all comdat sections for error checking.
DenseSet<StringRef> set;
for (Chunk *c : symtab->getChunks())
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
set.insert(sec->sym->getName());
// Open a file.
StringRef path = arg.substr(1);
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
// Parse a file. An order file contains one symbol per line.
// All symbols that are not listed in the order file are considered to
// have the lowest priority, 0, and are placed at the end of the output
// section.
for (StringRef arg : args::getLines(mb->getMemBufferRef())) {
std::string s(arg);
if (config->machine == I386 && !isDecorated(s))
s = "_" + s;
if (set.count(s) == 0) {
if (config->warnMissingOrderSymbol)
warn("/order:" + arg + ": missing symbol: " + s + " [LNK4037]");
}
else
config->order[s] = INT_MIN + config->order.size();
}
}
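// Illustrative example: "/order:@order.txt" where order.txt contains
//   ?init@@YAXXZ
//   main
// assigns priorities INT_MIN, INT_MIN + 1, ... in file order (on x86, the
// undecorated "main" is first rewritten to "_main"); every COMDAT not listed
// keeps priority 0 and is therefore placed after the listed ones.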
static void markAddrsig(Symbol *s) {
if (auto *d = dyn_cast_or_null<Defined>(s))
if (SectionChunk *c = dyn_cast_or_null<SectionChunk>(d->getChunk()))
c->keepUnique = true;
}
static void findKeepUniqueSections() {
// Exported symbols could be address-significant in other executables or DSOs,
// so we conservatively mark them as address-significant.
for (Export &r : config->exports)
markAddrsig(r.sym);
// Visit the address-significance table in each object file and mark each
// referenced symbol as address-significant.
for (ObjFile *obj : ObjFile::instances) {
ArrayRef<Symbol *> syms = obj->getSymbols();
if (obj->addrsigSec) {
ArrayRef<uint8_t> contents;
cantFail(
obj->getCOFFObj()->getSectionContents(obj->addrsigSec, contents));
const uint8_t *cur = contents.begin();
while (cur != contents.end()) {
unsigned size;
const char *err;
uint64_t symIndex = decodeULEB128(cur, &size, contents.end(), &err);
if (err)
fatal(toString(obj) + ": could not decode addrsig section: " + err);
if (symIndex >= syms.size())
fatal(toString(obj) + ": invalid symbol index in addrsig section");
markAddrsig(syms[symIndex]);
cur += size;
}
} else {
// If an object file does not have an address-significance table,
// conservatively mark all of its symbols as address-significant.
for (Symbol *s : syms)
markAddrsig(s);
}
}
}
// link.exe replaces each %foo% in altPath with the contents of environment
// variable foo, and adds the two magic env vars _PDB (expands to the basename
// of pdb's output path) and _EXT (expands to the extension of the output
// binary).
// lld only supports %_PDB% and %_EXT% and warns on references to all other env
// vars.
static void parsePDBAltPath(StringRef altPath) {
SmallString<128> buf;
StringRef pdbBasename =
sys::path::filename(config->pdbPath, sys::path::Style::windows);
StringRef binaryExtension =
sys::path::extension(config->outputFile, sys::path::Style::windows);
if (!binaryExtension.empty())
binaryExtension = binaryExtension.substr(1); // %_EXT% does not include '.'.
// Invariant:
//   +--------- cursor ('a...' might be the empty string).
//   |   +----- firstMark
//   |   |   +- secondMark
//   v   v   v
//   a...%...%...
size_t cursor = 0;
while (cursor < altPath.size()) {
size_t firstMark, secondMark;
if ((firstMark = altPath.find('%', cursor)) == StringRef::npos ||
(secondMark = altPath.find('%', firstMark + 1)) == StringRef::npos) {
// Didn't find another full fragment, treat rest of string as literal.
buf.append(altPath.substr(cursor));
break;
}
// Found a full fragment. Append text in front of first %, and interpret
// text between first and second % as variable name.
buf.append(altPath.substr(cursor, firstMark - cursor));
StringRef var = altPath.substr(firstMark, secondMark - firstMark + 1);
if (var.equals_lower("%_pdb%"))
buf.append(pdbBasename);
else if (var.equals_lower("%_ext%"))
buf.append(binaryExtension);
else {
warn("only %_PDB% and %_EXT% supported in /pdbaltpath:, keeping " +
var + " as literal");
buf.append(var);
}
cursor = secondMark + 1;
}
config->pdbAltPath = buf;
}
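// Illustrative examples, assuming /out:c:\tmp\foo.exe and a PDB written to
// c:\tmp\foo.pdb:
//   /pdbaltpath:%_PDB%        -> "foo.pdb"
//   /pdbaltpath:x\%_EXT%.pdb  -> "x\exe.pdb"
//   /pdbaltpath:%FOO%.pdb     -> "%FOO%.pdb" kept literally, plus a warning,
//                                since only %_PDB% and %_EXT% are expanded.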
/// Convert resource files and potentially merge input resource object
/// trees into one resource tree.
/// Call after ObjFile::instances is complete.
void LinkerDriver::convertResources() {
std::vector<ObjFile *> resourceObjFiles;
for (ObjFile *f : ObjFile::instances) {
if (f->isResourceObjFile())
resourceObjFiles.push_back(f);
}
if (!config->mingw &&
(resourceObjFiles.size() > 1 ||
(resourceObjFiles.size() == 1 && !resources.empty()))) {
error((!resources.empty() ? "internal .obj file created from .res files"
: toString(resourceObjFiles[1])) +
": more than one resource obj file not allowed, already got " +
toString(resourceObjFiles.front()));
return;
}
if (resources.empty() && resourceObjFiles.size() <= 1) {
// No resources to convert, and max one resource object file in
// the input. Keep that preconverted resource section as is.
for (ObjFile *f : resourceObjFiles)
f->includeResourceChunks();
return;
}
ObjFile *f = make<ObjFile>(convertResToCOFF(resources, resourceObjFiles));
symtab->addFile(f);
f->includeResourceChunks();
}
// In MinGW, if no symbols are chosen to be exported, then all symbols are
// automatically exported by default. This behavior can be forced by the
// -export-all-symbols option, so that it happens even when exports are
// explicitly specified. The automatic behavior can be disabled using the
// -exclude-all-symbols option, so that lld-link behaves like link.exe rather
// than MinGW in the case that nothing is explicitly exported.
void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) {
if (!config->dll)
return;
if (!args.hasArg(OPT_export_all_symbols)) {
if (!config->exports.empty())
return;
if (args.hasArg(OPT_exclude_all_symbols))
return;
}
AutoExporter exporter;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
exporter.addWholeArchive(*path);
symtab->forEachSymbol([&](Symbol *s) {
auto *def = dyn_cast<Defined>(s);
if (!exporter.shouldExport(def))
return;
Export e;
e.name = def->getName();
e.sym = def;
if (Chunk *c = def->getChunk())
if (!(c->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE))
e.data = true;
config->exports.push_back(e);
});
}
// lld has a feature to create a tar file containing all input files as well as
// all command line options, so that other people can run lld again with exactly
// the same inputs. This feature is accessible via /linkrepro and /reproduce.
//
// /linkrepro and /reproduce are very similar, but /linkrepro takes a directory
// name while /reproduce takes a full path. We have /linkrepro for compatibility
// with Microsoft link.exe.
Optional<std::string> getReproduceFile(const opt::InputArgList &args) {
if (auto *arg = args.getLastArg(OPT_reproduce))
return std::string(arg->getValue());
if (auto *arg = args.getLastArg(OPT_linkrepro)) {
SmallString<64> path = StringRef(arg->getValue());
sys::path::append(path, "repro.tar");
return std::string(path);
}
return None;
}
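// Examples (hypothetical paths):
//   /reproduce:c:\tmp\foo.tar  -> c:\tmp\foo.tar
//   /linkrepro:c:\tmp          -> c:\tmp\repro.tar
//   neither option             -> None, and no tar file is written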
void LinkerDriver::link(ArrayRef<const char *> argsArr) {
ScopedTimer rootTimer(Timer::root());
// Needed for LTO.
InitializeAllTargetInfos();
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmParsers();
InitializeAllAsmPrinters();
// If the first command line argument is "/lib", link.exe acts like lib.exe.
// We call our own implementation of lib.exe that understands bitcode files.
if (argsArr.size() > 1 && StringRef(argsArr[1]).equals_lower("/lib")) {
if (llvm::libDriverMain(argsArr.slice(1)) != 0)
fatal("lib failed");
return;
}
// Parse command line options.
ArgParser parser;
opt::InputArgList args = parser.parse(argsArr);
// Parse and evaluate -mllvm options.
std::vector<const char *> v;
v.push_back("lld-link (LLVM option parsing)");
for (auto *arg : args.filtered(OPT_mllvm))
v.push_back(arg->getValue());
cl::ParseCommandLineOptions(v.size(), v.data());
// Handle /errorlimit early, because error() depends on it.
if (auto *arg = args.getLastArg(OPT_errorlimit)) {
int n = 20;
StringRef s = arg->getValue();
if (s.getAsInteger(10, n))
error(arg->getSpelling() + " number expected, but got " + s);
errorHandler().errorLimit = n;
}
// Handle /help
if (args.hasArg(OPT_help)) {
printHelp(argsArr[0]);
return;
}
// /threads: takes a positive integer and provides the default value for
// /opt:lldltojobs=.
if (auto *arg = args.getLastArg(OPT_threads)) {
StringRef v(arg->getValue());
unsigned threads = 0;
if (!llvm::to_integer(v, threads, 0) || threads == 0)
error(arg->getSpelling() + ": expected a positive integer, but got '" +
arg->getValue() + "'");
parallel::strategy = hardware_concurrency(threads);
config->thinLTOJobs = v.str();
}
if (args.hasArg(OPT_show_timing))
config->showTiming = true;
config->showSummary = args.hasArg(OPT_summary);
// Handle --version, which is an lld extension. This option is a bit odd
// because it doesn't start with "/", but we deliberately chose "--" to
// avoid conflict with /version and for compatibility with clang-cl.
if (args.hasArg(OPT_dash_dash_version)) {
lld::outs() << getLLDVersion() << "\n";
return;
}
// Handle /lldmingw early, since it can potentially affect how other
// options are handled.
config->mingw = args.hasArg(OPT_lldmingw);
// Handle /linkrepro and /reproduce.
if (Optional<std::string> path = getReproduceFile(args)) {
Expected<std::unique_ptr<TarWriter>> errOrWriter =
TarWriter::create(*path, sys::path::stem(*path));
if (errOrWriter) {
tar = std::move(*errOrWriter);
} else {
error("/linkrepro: failed to open " + *path + ": " +
toString(errOrWriter.takeError()));
}
}
if (!args.hasArg(OPT_INPUT, OPT_wholearchive_file)) {
if (args.hasArg(OPT_deffile))
config->noEntry = true;
else
fatal("no input files");
}
// Construct search path list.
searchPaths.push_back("");
for (auto *arg : args.filtered(OPT_libpath))
searchPaths.push_back(arg->getValue());
if (!args.hasArg(OPT_lldignoreenv))
addLibSearchPaths();
// Handle /ignore
for (auto *arg : args.filtered(OPT_ignore)) {
SmallVector<StringRef, 8> vec;
StringRef(arg->getValue()).split(vec, ',');
for (StringRef s : vec) {
if (s == "4037")
config->warnMissingOrderSymbol = false;
else if (s == "4099")
config->warnDebugInfoUnusable = false;
else if (s == "4217")
config->warnLocallyDefinedImported = false;
else if (s == "longsections")
config->warnLongSectionNames = false;
// Other warning numbers are ignored.
}
}
// Handle /out
if (auto *arg = args.getLastArg(OPT_out))
config->outputFile = arg->getValue();
// Handle /verbose
if (args.hasArg(OPT_verbose))
config->verbose = true;
errorHandler().verbose = config->verbose;
// Handle /force or /force:unresolved
if (args.hasArg(OPT_force, OPT_force_unresolved))
config->forceUnresolved = true;
// Handle /force or /force:multiple
if (args.hasArg(OPT_force, OPT_force_multiple))
config->forceMultiple = true;
// Handle /force or /force:multipleres
if (args.hasArg(OPT_force, OPT_force_multipleres))
config->forceMultipleRes = true;
// Handle /debug
DebugKind debug = parseDebugKind(args);
if (debug == DebugKind::Full || debug == DebugKind::Dwarf ||
debug == DebugKind::GHash) {
config->debug = true;
config->incremental = true;
}
// Handle /demangle
config->demangle = args.hasFlag(OPT_demangle, OPT_demangle_no);
// Handle /debugtype
config->debugTypes = parseDebugTypes(args);
// Handle /driver[:uponly|:wdm].
config->driverUponly = args.hasArg(OPT_driver_uponly) ||
args.hasArg(OPT_driver_uponly_wdm) ||
args.hasArg(OPT_driver_wdm_uponly);
config->driverWdm = args.hasArg(OPT_driver_wdm) ||
args.hasArg(OPT_driver_uponly_wdm) ||
args.hasArg(OPT_driver_wdm_uponly);
config->driver =
config->driverUponly || config->driverWdm || args.hasArg(OPT_driver);
// Handle /pdb
bool shouldCreatePDB =
(debug == DebugKind::Full || debug == DebugKind::GHash);
if (shouldCreatePDB) {
if (auto *arg = args.getLastArg(OPT_pdb))
config->pdbPath = arg->getValue();
if (auto *arg = args.getLastArg(OPT_pdbaltpath))
config->pdbAltPath = arg->getValue();
if (args.hasArg(OPT_natvis))
config->natvisFiles = args.getAllArgValues(OPT_natvis);
if (args.hasArg(OPT_pdbstream)) {
for (const StringRef value : args.getAllArgValues(OPT_pdbstream)) {
const std::pair<StringRef, StringRef> nameFile = value.split("=");
const StringRef name = nameFile.first;
const std::string file = nameFile.second.str();
config->namedStreams[name] = file;
}
}
if (auto *arg = args.getLastArg(OPT_pdb_source_path))
config->pdbSourcePath = arg->getValue();
}
// Handle /pdbstripped
if (args.hasArg(OPT_pdbstripped))
warn("ignoring /pdbstripped flag, it is not yet supported");
// Handle /noentry
if (args.hasArg(OPT_noentry)) {
if (args.hasArg(OPT_dll))
config->noEntry = true;
else
error("/noentry must be specified with /dll");
}
// Handle /dll
if (args.hasArg(OPT_dll)) {
config->dll = true;
config->manifestID = 2;
}
// Handle /dynamicbase and /fixed. We can't use hasFlag for /dynamicbase
// because we need to explicitly check whether that option or its inverse was
// present in the argument list in order to handle /fixed.
auto *dynamicBaseArg = args.getLastArg(OPT_dynamicbase, OPT_dynamicbase_no);
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase_no)
config->dynamicBase = false;
// MSDN claims "/FIXED:NO is the default setting for a DLL, and /FIXED is the
// default setting for any other project type.", but link.exe defaults to
// /FIXED:NO for exe outputs as well. Match behavior, not docs.
bool fixed = args.hasFlag(OPT_fixed, OPT_fixed_no, false);
if (fixed) {
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase) {
error("/fixed must not be specified with /dynamicbase");
} else {
config->relocatable = false;
config->dynamicBase = false;
}
}
// Handle /appcontainer
config->appContainer =
args.hasFlag(OPT_appcontainer, OPT_appcontainer_no, false);
// Handle /machine
if (auto *arg = args.getLastArg(OPT_machine)) {
config->machine = getMachineType(arg->getValue());
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN)
fatal(Twine("unknown /machine argument: ") + arg->getValue());
}
// Handle /nodefaultlib:<filename>
for (auto *arg : args.filtered(OPT_nodefaultlib))
config->noDefaultLibs.insert(doFindLib(arg->getValue()).lower());
// Handle /nodefaultlib
if (args.hasArg(OPT_nodefaultlib_all))
config->noDefaultLibAll = true;
// Handle /base
if (auto *arg = args.getLastArg(OPT_base))
parseNumbers(arg->getValue(), &config->imageBase);
// Handle /filealign
if (auto *arg = args.getLastArg(OPT_filealign)) {
parseNumbers(arg->getValue(), &config->fileAlign);
if (!isPowerOf2_64(config->fileAlign))
error("/filealign: not a power of two: " + Twine(config->fileAlign));
}
// Handle /stack
if (auto *arg = args.getLastArg(OPT_stack))
parseNumbers(arg->getValue(), &config->stackReserve, &config->stackCommit);
// Handle /guard:cf
if (auto *arg = args.getLastArg(OPT_guard))
parseGuard(arg->getValue());
// Handle /heap
if (auto *arg = args.getLastArg(OPT_heap))
parseNumbers(arg->getValue(), &config->heapReserve, &config->heapCommit);
// Handle /version
if (auto *arg = args.getLastArg(OPT_version))
parseVersion(arg->getValue(), &config->majorImageVersion,
&config->minorImageVersion);
// Handle /subsystem
if (auto *arg = args.getLastArg(OPT_subsystem))
parseSubsystem(arg->getValue(), &config->subsystem, &config->majorOSVersion,
&config->minorOSVersion);
// Handle /timestamp
if (llvm::opt::Arg *arg = args.getLastArg(OPT_timestamp, OPT_repro)) {
if (arg->getOption().getID() == OPT_repro) {
config->timestamp = 0;
config->repro = true;
} else {
config->repro = false;
StringRef value(arg->getValue());
if (value.getAsInteger(0, config->timestamp))
fatal(Twine("invalid timestamp: ") + value +
". Expected 32-bit integer");
}
} else {
config->repro = false;
config->timestamp = time(nullptr);
}
// Handle /alternatename
for (auto *arg : args.filtered(OPT_alternatename))
parseAlternateName(arg->getValue());
// Handle /include
for (auto *arg : args.filtered(OPT_incl))
addUndefined(arg->getValue());
// Handle /implib
if (auto *arg = args.getLastArg(OPT_implib))
config->implib = arg->getValue();
// Handle /opt.
bool doGC = debug == DebugKind::None || args.hasArg(OPT_profile);
unsigned icfLevel =
args.hasArg(OPT_profile) ? 0 : 1; // 0: off, 1: limited, 2: on
unsigned tailMerge = 1;
for (auto *arg : args.filtered(OPT_opt)) {
std::string str = StringRef(arg->getValue()).lower();
SmallVector<StringRef, 1> vec;
StringRef(str).split(vec, ',');
for (StringRef s : vec) {
if (s == "ref") {
doGC = true;
} else if (s == "noref") {
doGC = false;
} else if (s == "icf" || s.startswith("icf=")) {
icfLevel = 2;
} else if (s == "noicf") {
icfLevel = 0;
} else if (s == "lldtailmerge") {
tailMerge = 2;
} else if (s == "nolldtailmerge") {
tailMerge = 0;
} else if (s.startswith("lldlto=")) {
StringRef optLevel = s.substr(7);
if (optLevel.getAsInteger(10, config->ltoo) || config->ltoo > 3)
error("/opt:lldlto: invalid optimization level: " + optLevel);
} else if (s.startswith("lldltojobs=")) {
StringRef jobs = s.substr(11);
if (!get_threadpool_strategy(jobs))
error("/opt:lldltojobs: invalid job count: " + jobs);
config->thinLTOJobs = jobs.str();
} else if (s.startswith("lldltopartitions=")) {
StringRef n = s.substr(17);
if (n.getAsInteger(10, config->ltoPartitions) ||
config->ltoPartitions == 0)
error("/opt:lldltopartitions: invalid partition count: " + n);
} else if (s != "lbr" && s != "nolbr")
error("/opt: unknown option: " + s);
}
}
// Limited ICF is enabled if GC is enabled and ICF was never mentioned
// explicitly.
// FIXME: LLD only implements "limited" ICF, i.e. it only merges identical
// code. If the user passes /OPT:ICF explicitly, LLD should merge identical
// comdat readonly data.
if (icfLevel == 1 && !doGC)
icfLevel = 0;
config->doGC = doGC;
config->doICF = icfLevel > 0;
config->tailMerge = (tailMerge == 1 && config->doICF) || tailMerge == 2;
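// A few worked /opt combinations under the defaults above (illustrative):
//   /debug with no /opt          -> doGC = false, so limited ICF is disabled too
//   /opt:ref,icf                 -> doGC = true, full ICF
//   /opt:noref                   -> doGC = false; ICF stays off unless
//                                   /opt:icf is passed explicitly
//   /opt:lldlto=2,lldltojobs=4   -> LTO optimization level 2 with 4 LTO jobs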
// Handle /lldsavetemps
if (args.hasArg(OPT_lldsavetemps))
config->saveTemps = true;
// Handle /kill-at
if (args.hasArg(OPT_kill_at))
config->killAt = true;
// Handle /lldltocache
if (auto *arg = args.getLastArg(OPT_lldltocache))
config->ltoCache = arg->getValue();
// Handle /lldsavecachepolicy
if (auto *arg = args.getLastArg(OPT_lldltocachepolicy))
config->ltoCachePolicy = CHECK(
parseCachePruningPolicy(arg->getValue()),
Twine("/lldltocachepolicy: invalid cache policy: ") + arg->getValue());
// Handle /failifmismatch
for (auto *arg : args.filtered(OPT_failifmismatch))
checkFailIfMismatch(arg->getValue(), nullptr);
// Handle /merge
for (auto *arg : args.filtered(OPT_merge))
parseMerge(arg->getValue());
// Add default section merging rules after user rules. User rules take
// precedence, but we will emit a warning if there is a conflict.
parseMerge(".idata=.rdata");
parseMerge(".didat=.rdata");
parseMerge(".edata=.rdata");
parseMerge(".xdata=.rdata");
parseMerge(".bss=.data");
if (config->mingw) {
parseMerge(".ctors=.rdata");
parseMerge(".dtors=.rdata");
parseMerge(".CRT=.rdata");
}
// Handle /section
for (auto *arg : args.filtered(OPT_section))
parseSection(arg->getValue());
// Handle /align
if (auto *arg = args.getLastArg(OPT_align)) {
parseNumbers(arg->getValue(), &config->align);
if (!isPowerOf2_64(config->align))
error("/align: not a power of two: " + StringRef(arg->getValue()));
if (!args.hasArg(OPT_driver))
warn("/align specified without /driver; image may not run");
}
// Handle /aligncomm
for (auto *arg : args.filtered(OPT_aligncomm))
parseAligncomm(arg->getValue());
// Handle /manifestdependency. This enables /manifest unless /manifest:no is
// also passed.
if (auto *arg = args.getLastArg(OPT_manifestdependency)) {
config->manifestDependency = arg->getValue();
config->manifest = Configuration::SideBySide;
}
// Handle /manifest and /manifest:
if (auto *arg = args.getLastArg(OPT_manifest, OPT_manifest_colon)) {
if (arg->getOption().getID() == OPT_manifest)
config->manifest = Configuration::SideBySide;
else
parseManifest(arg->getValue());
}
// Handle /manifestuac
if (auto *arg = args.getLastArg(OPT_manifestuac))
parseManifestUAC(arg->getValue());
// Handle /manifestfile
if (auto *arg = args.getLastArg(OPT_manifestfile))
config->manifestFile = arg->getValue();
// Handle /manifestinput
for (auto *arg : args.filtered(OPT_manifestinput))
config->manifestInput.push_back(arg->getValue());
if (!config->manifestInput.empty() &&
config->manifest != Configuration::Embed) {
fatal("/manifestinput: requires /manifest:embed");
}
config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files);
config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
args.hasArg(OPT_thinlto_index_only_arg);
config->thinLTOIndexOnlyArg =
args.getLastArgValue(OPT_thinlto_index_only_arg);
config->thinLTOPrefixReplace =
getOldNewOptions(args, OPT_thinlto_prefix_replace);
config->thinLTOObjectSuffixReplace =
getOldNewOptions(args, OPT_thinlto_object_suffix_replace);
config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path);
// Handle miscellaneous boolean flags.
config->allowBind = args.hasFlag(OPT_allowbind, OPT_allowbind_no, true);
config->allowIsolation =
args.hasFlag(OPT_allowisolation, OPT_allowisolation_no, true);
config->incremental =
args.hasFlag(OPT_incremental, OPT_incremental_no,
!config->doGC && !config->doICF && !args.hasArg(OPT_order) &&
!args.hasArg(OPT_profile));
config->integrityCheck =
args.hasFlag(OPT_integritycheck, OPT_integritycheck_no, false);
config->cetCompat = args.hasFlag(OPT_cetcompat, OPT_cetcompat_no, false);
config->nxCompat = args.hasFlag(OPT_nxcompat, OPT_nxcompat_no, true);
for (auto *arg : args.filtered(OPT_swaprun))
parseSwaprun(arg->getValue());
config->terminalServerAware =
!config->dll && args.hasFlag(OPT_tsaware, OPT_tsaware_no, true);
config->debugDwarf = debug == DebugKind::Dwarf;
config->debugGHashes = debug == DebugKind::GHash;
config->debugSymtab = debug == DebugKind::Symtab;
config->autoImport =
args.hasFlag(OPT_auto_import, OPT_auto_import_no, config->mingw);
config->pseudoRelocs = args.hasFlag(
OPT_runtime_pseudo_reloc, OPT_runtime_pseudo_reloc_no, config->mingw);
// Don't warn about long section names, such as .debug_info, for mingw or when
// -debug:dwarf is requested.
if (config->mingw || config->debugDwarf)
config->warnLongSectionNames = false;
config->lldmapFile = getMapFile(args, OPT_lldmap, OPT_lldmap_file);
config->mapFile = getMapFile(args, OPT_map, OPT_map_file);
if (config->lldmapFile != "" && config->lldmapFile == config->mapFile) {
warn("/lldmap and /map have the same output file '" + config->mapFile +
"'.\n>>> ignoring /lldmap");
config->lldmapFile.clear();
}
if (config->incremental && args.hasArg(OPT_profile)) {
warn("ignoring '/incremental' due to '/profile' specification");
config->incremental = false;
}
if (config->incremental && args.hasArg(OPT_order)) {
warn("ignoring '/incremental' due to '/order' specification");
config->incremental = false;
}
if (config->incremental && config->doGC) {
warn("ignoring '/incremental' because REF is enabled; use '/opt:noref' to "
"disable");
config->incremental = false;
}
if (config->incremental && config->doICF) {
warn("ignoring '/incremental' because ICF is enabled; use '/opt:noicf' to "
"disable");
config->incremental = false;
}
if (errorCount())
return;
std::set<sys::fs::UniqueID> wholeArchives;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
if (Optional<sys::fs::UniqueID> id = getUniqueID(*path))
wholeArchives.insert(*id);
// A predicate returning true if a given path is an argument for
// /wholearchive:, or if /wholearchive is enabled globally.
// This function is a bit tricky because "foo.obj /wholearchive:././foo.obj"
// needs to be handled as "/wholearchive:foo.obj foo.obj".
auto isWholeArchive = [&](StringRef path) -> bool {
if (args.hasArg(OPT_wholearchive_flag))
return true;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
return wholeArchives.count(*id);
return false;
};
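// Illustrative example of the tricky case mentioned above: given
// "foo.obj /wholearchive:././foo.obj", both spellings resolve to the same
// sys::fs::UniqueID, so the plain foo.obj occurrence is also treated as
// /wholearchive.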
// Create a list of input files. These can be given as OPT_INPUT options
// and OPT_wholearchive_file options, and we also need to track OPT_start_lib
// and OPT_end_lib.
bool inLib = false;
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_end_lib:
if (!inLib)
error("stray " + arg->getSpelling());
inLib = false;
break;
case OPT_start_lib:
if (inLib)
error("nested " + arg->getSpelling());
inLib = true;
break;
case OPT_wholearchive_file:
if (Optional<StringRef> path = findFile(arg->getValue()))
enqueuePath(*path, true, inLib);
break;
case OPT_INPUT:
if (Optional<StringRef> path = findFile(arg->getValue()))
enqueuePath(*path, isWholeArchive(*path), inLib);
break;
default:
// Ignore other options.
break;
}
}
// Process files specified as /defaultlib. These should be enqueued after
// other files, which is why they are in a separate loop.
for (auto *arg : args.filtered(OPT_defaultlib))
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false, false);
// Windows specific -- Create a resource file containing a manifest file.
if (config->manifest == Configuration::Embed)
addBuffer(createManifestRes(), false, false);
// Read all input files given via the command line.
run();
if (errorCount())
return;
// We should have inferred a machine type by now from the input files, but if
// not we assume x64.
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN) {
warn("/machine is not specified. x64 is assumed");
config->machine = AMD64;
}
config->wordsize = config->is64() ? 8 : 4;
// Handle /safeseh, x86 only, on by default, except for mingw.
- if (config->machine == I386 &&
- args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw))
- config->safeSEH = true;
+ if (config->machine == I386) {
+ config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw);
+ config->noSEH = args.hasArg(OPT_noseh);
+ }
// Handle /functionpadmin
for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt))
parseFunctionPadMin(arg, config->machine);
if (tar)
tar->append("response.txt",
createResponseFile(args, filePaths,
ArrayRef<StringRef>(searchPaths).slice(1)));
// Handle /largeaddressaware
config->largeAddressAware = args.hasFlag(
OPT_largeaddressaware, OPT_largeaddressaware_no, config->is64());
// Handle /highentropyva
config->highEntropyVA =
config->is64() &&
args.hasFlag(OPT_highentropyva, OPT_highentropyva_no, true);
if (!config->dynamicBase &&
(config->machine == ARMNT || config->machine == ARM64))
error("/dynamicbase:no is not compatible with " +
machineToStr(config->machine));
// Handle /export
for (auto *arg : args.filtered(OPT_export)) {
Export e = parseExport(arg->getValue());
if (config->machine == I386) {
if (!isDecorated(e.name))
e.name = saver.save("_" + e.name);
if (!e.extName.empty() && !isDecorated(e.extName))
e.extName = saver.save("_" + e.extName);
}
config->exports.push_back(e);
}
// Handle /def
if (auto *arg = args.getLastArg(OPT_deffile)) {
// parseModuleDefs mutates Config object.
parseModuleDefs(arg->getValue());
}
// Handle generation of import library from a def file.
if (!args.hasArg(OPT_INPUT, OPT_wholearchive_file)) {
fixupExports();
createImportLibrary(/*asLib=*/true);
return;
}
// Windows specific -- if no /subsystem is given, we need to infer
// that from entry point name. Must happen before /entry handling,
// and after the early return when just writing an import library.
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) {
config->subsystem = inferSubsystem();
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
fatal("subsystem must be defined");
}
// Handle /entry and /dll
if (auto *arg = args.getLastArg(OPT_entry)) {
config->entry = addUndefined(mangle(arg->getValue()));
} else if (!config->entry && !config->noEntry) {
if (args.hasArg(OPT_dll)) {
StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
: "_DllMainCRTStartup";
config->entry = addUndefined(s);
} else if (config->driverWdm) {
// /driver:wdm implies /entry:_NtProcessStartup
config->entry = addUndefined(mangle("_NtProcessStartup"));
} else {
// Windows specific -- If an entry point name is not given, we need to
// infer it from the main/WinMain-style functions that are actually defined.
StringRef s = findDefaultEntry();
if (s.empty())
fatal("entry point must be defined");
config->entry = addUndefined(s);
log("Entry name inferred: " + s);
}
}
// Handle /delayload
for (auto *arg : args.filtered(OPT_delayload)) {
config->delayLoads.insert(StringRef(arg->getValue()).lower());
if (config->machine == I386) {
config->delayLoadHelper = addUndefined("___delayLoadHelper2@8");
} else {
config->delayLoadHelper = addUndefined("__delayLoadHelper2");
}
}
// Set default image name if neither /out nor /def set it.
if (config->outputFile.empty()) {
config->outputFile = getOutputPath(
(*args.filtered(OPT_INPUT, OPT_wholearchive_file).begin())->getValue());
}
// Fail early if an output file is not writable.
if (auto e = tryCreateFile(config->outputFile)) {
error("cannot open output file " + config->outputFile + ": " + e.message());
return;
}
if (shouldCreatePDB) {
// Put the PDB next to the image if no /pdb flag was passed.
if (config->pdbPath.empty()) {
config->pdbPath = config->outputFile;
sys::path::replace_extension(config->pdbPath, ".pdb");
}
// The embedded PDB path should be the absolute path to the PDB if no
// /pdbaltpath flag was passed.
if (config->pdbAltPath.empty()) {
config->pdbAltPath = config->pdbPath;
// It's important to make the path absolute and remove dots. This path
// will eventually be written into the PE header, and certain Microsoft
// tools won't work correctly if these assumptions are not held.
sys::fs::make_absolute(config->pdbAltPath);
sys::path::remove_dots(config->pdbAltPath);
} else {
// Don't do this earlier, so that Config->OutputFile is ready.
parsePDBAltPath(config->pdbAltPath);
}
}
// Set default image base if /base is not given.
if (config->imageBase == uint64_t(-1))
config->imageBase = getDefaultImageBase();
symtab->addSynthetic(mangle("__ImageBase"), nullptr);
if (config->machine == I386) {
symtab->addAbsolute("___safe_se_handler_table", 0);
symtab->addAbsolute("___safe_se_handler_count", 0);
}
symtab->addAbsolute(mangle("__guard_fids_count"), 0);
symtab->addAbsolute(mangle("__guard_fids_table"), 0);
symtab->addAbsolute(mangle("__guard_flags"), 0);
symtab->addAbsolute(mangle("__guard_iat_count"), 0);
symtab->addAbsolute(mangle("__guard_iat_table"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_count"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_table"), 0);
// Needed for MSVC 2017 15.5 CRT.
symtab->addAbsolute(mangle("__enclave_config"), 0);
if (config->pseudoRelocs) {
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0);
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0);
}
if (config->mingw) {
symtab->addAbsolute(mangle("__CTOR_LIST__"), 0);
symtab->addAbsolute(mangle("__DTOR_LIST__"), 0);
}
// This code may add new undefined symbols to the link, which may enqueue more
// symbol resolution tasks, so we need to continue executing tasks until we
// converge.
do {
// Windows specific -- if entry point is not found,
// search for its mangled names.
if (config->entry)
mangleMaybe(config->entry);
// Windows specific -- Make sure we resolve all dllexported symbols.
for (Export &e : config->exports) {
if (!e.forwardTo.empty())
continue;
e.sym = addUndefined(e.name);
if (!e.directives)
e.symbolName = mangleMaybe(e.sym);
}
// Add weak aliases. Weak aliases are a mechanism that gives remaining
// undefined symbols a final chance to be resolved successfully.
for (auto pair : config->alternateNames) {
StringRef from = pair.first;
StringRef to = pair.second;
Symbol *sym = symtab->find(from);
if (!sym)
continue;
if (auto *u = dyn_cast<Undefined>(sym))
if (!u->weakAlias)
u->weakAlias = symtab->addUndefined(to);
}
// If any inputs are bitcode files, the LTO code generator may create
// references to library functions that are not explicit in the bitcode
// file's symbol table. If any of those library functions are defined in a
// bitcode file in an archive member, we need to arrange to use LTO to
// compile those archive members by adding them to the link beforehand.
if (!BitcodeFile::instances.empty())
for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
symtab->addLibcall(s);
// Windows specific -- if __load_config_used can be resolved, resolve it.
if (symtab->findUnderscore("_load_config_used"))
addUndefined(mangle("_load_config_used"));
} while (run());
if (args.hasArg(OPT_include_optional)) {
// Handle /includeoptional
for (auto *arg : args.filtered(OPT_include_optional))
if (dyn_cast_or_null<LazyArchive>(symtab->find(arg->getValue())))
addUndefined(arg->getValue());
while (run());
}
if (config->autoImport) {
// MinGW specific.
// Load any further object files that might be needed for doing automatic
// imports.
//
// For cases with no automatically imported symbols, this iterates once
// over the symbol table and doesn't do anything.
//
// For the normal case with a few automatically imported symbols, this
// should only need to be run once, since each new object file imported
// is an import library and wouldn't add any new undefined references,
// but there's nothing stopping the __imp_ symbols from coming from a
// normal object file as well (although that won't be used for the
// actual autoimport later on). If this pass adds new undefined references,
// we won't iterate further to resolve them.
symtab->loadMinGWAutomaticImports();
run();
}
// At this point, we should not have any symbols that cannot be resolved.
// If we are going to do codegen for link-time optimization, check for
// unresolvable symbols first, so we don't spend time generating code that
// will fail to link anyway.
if (!BitcodeFile::instances.empty() && !config->forceUnresolved)
symtab->reportUnresolvable();
if (errorCount())
return;
// Do LTO by compiling bitcode input files to a set of native COFF files and
// then linking those files (unless -thinlto-index-only was given, in which
// case we resolve symbols and write indices, but don't generate native code
// or link).
symtab->addCombinedLTOObjects();
// If -thinlto-index-only is given, we should create only "index
// files" and not object files. Index file creation is already done
// in addCombinedLTOObjects, so we are done if that's the case.
if (config->thinLTOIndexOnly)
return;
// If we generated native object files from bitcode files, this resolves
// references to the symbols we use from them.
run();
// Resolve remaining undefined symbols and warn about imported locals.
symtab->resolveRemainingUndefines();
if (errorCount())
return;
config->hadExplicitExports = !config->exports.empty();
if (config->mingw) {
// In MinGW, all symbols are automatically exported if no symbols
// are chosen to be exported.
maybeExportMinGWSymbols(args);
// Make sure the crtend.o object is the last object file. This object
// file can contain terminating section chunks that need to be placed
// last. GNU ld processes files and static libraries explicitly in the
// order provided on the command line, while lld will pull in needed
// files from static libraries only after the last object file on the
// command line.
for (auto i = ObjFile::instances.begin(), e = ObjFile::instances.end();
i != e; i++) {
ObjFile *file = *i;
if (isCrtend(file->getName())) {
ObjFile::instances.erase(i);
ObjFile::instances.push_back(file);
break;
}
}
}
// Windows specific -- when we are creating a .dll file, we also
// need to create a .lib file. In MinGW mode, we only do that when the
// -implib option is given explicitly, for compatibility with GNU ld.
if (!config->exports.empty() || config->dll) {
fixupExports();
if (!config->mingw || !config->implib.empty())
createImportLibrary(/*asLib=*/false);
assignExportOrdinals();
}
// Handle /output-def (MinGW specific).
if (auto *arg = args.getLastArg(OPT_output_def))
writeDefFile(arg->getValue());
// Set extra alignment for .comm symbols
for (auto pair : config->alignComm) {
StringRef name = pair.first;
uint32_t alignment = pair.second;
Symbol *sym = symtab->find(name);
if (!sym) {
warn("/aligncomm symbol " + name + " not found");
continue;
}
// If the symbol isn't common, it must have been replaced with a regular
// symbol, which will carry its own alignment.
auto *dc = dyn_cast<DefinedCommon>(sym);
if (!dc)
continue;
CommonChunk *c = dc->getChunk();
c->setAlignment(std::max(c->getAlignment(), alignment));
}
// Windows specific -- Create a side-by-side manifest file.
if (config->manifest == Configuration::SideBySide)
createSideBySideManifest();
// Handle /order. We want to do this at this moment because we
// need a complete list of comdat sections to warn on nonexistent
// functions.
if (auto *arg = args.getLastArg(OPT_order))
parseOrderFile(arg->getValue());
// Identify unreferenced COMDAT sections.
if (config->doGC)
markLive(symtab->getChunks());
// Needs to happen after the last call to addFile().
convertResources();
// Identify identical COMDAT sections to merge them.
if (config->doICF) {
findKeepUniqueSections();
doICF(symtab->getChunks());
}
// Write the result.
writeResult();
// Stop early so we can print the results.
rootTimer.stop();
if (config->showTiming)
Timer::root().print();
}
} // namespace coff
} // namespace lld
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 0adc2b91bd99..4346b3a2ffa7 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -1,1064 +1,1064 @@
//===- InputFiles.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "InputFiles.h"
#include "Chunks.h"
#include "Config.h"
#include "DebugTypes.h"
#include "Driver.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/DWARF.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm-c/lto.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Target/TargetOptions.h"
#include <cstring>
#include <system_error>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::codeview;
using namespace llvm::object;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::coff;
using llvm::Triple;
using llvm::support::ulittle32_t;
// Returns the last element of a path, which is supposed to be a filename.
static StringRef getBasename(StringRef path) {
return sys::path::filename(path, sys::path::Style::windows);
}
// Returns a string in the format of "foo.obj" or "foo.obj(bar.lib)".
std::string lld::toString(const coff::InputFile *file) {
if (!file)
return "<internal>";
if (file->parentName.empty() || file->kind() == coff::InputFile::ImportKind)
return std::string(file->getName());
return (getBasename(file->parentName) + "(" + getBasename(file->getName()) +
")")
.str();
}
std::vector<ObjFile *> ObjFile::instances;
std::map<std::string, PDBInputFile *> PDBInputFile::instances;
std::vector<ImportFile *> ImportFile::instances;
std::vector<BitcodeFile *> BitcodeFile::instances;
/// Checks that Source is compatible with being a weak alias to Target.
/// If Source is Undefined and has no weak alias set, makes it a weak
/// alias to Target.
static void checkAndSetWeakAlias(SymbolTable *symtab, InputFile *f,
Symbol *source, Symbol *target) {
if (auto *u = dyn_cast<Undefined>(source)) {
if (u->weakAlias && u->weakAlias != target) {
// Weak aliases as produced by GCC are named in the form
// .weak.<weaksymbol>.<othersymbol>, where <othersymbol> is the name
// of another symbol emitted near the weak symbol.
// Just use the definition from the first object file that defined
// this weak symbol.
if (config->mingw)
return;
symtab->reportDuplicate(source, f);
}
u->weakAlias = target;
}
}
static bool ignoredSymbolName(StringRef name) {
return name == "@feat.00" || name == "@comp.id";
}
ArchiveFile::ArchiveFile(MemoryBufferRef m) : InputFile(ArchiveKind, m) {}
void ArchiveFile::parse() {
// Parse a MemoryBufferRef as an archive file.
file = CHECK(Archive::create(mb), this);
// Read the symbol table to construct Lazy objects.
for (const Archive::Symbol &sym : file->symbols())
symtab->addLazyArchive(this, sym);
}
// Enqueues the archive member containing a given symbol for loading.
void ArchiveFile::addMember(const Archive::Symbol &sym) {
const Archive::Child &c =
CHECK(sym.getMember(),
"could not get the member for symbol " + toCOFFString(sym));
// Don't enqueue the same member more than once.
if (!seen.insert(c.getChildOffset()).second)
return;
driver->enqueueArchiveMember(c, sym, getName());
}
std::vector<MemoryBufferRef> lld::coff::getArchiveMembers(Archive *file) {
std::vector<MemoryBufferRef> v;
Error err = Error::success();
for (const Archive::Child &c : file->children(err)) {
MemoryBufferRef mbref =
CHECK(c.getMemoryBufferRef(),
file->getFileName() +
": could not get the buffer for a child of the archive");
v.push_back(mbref);
}
if (err)
fatal(file->getFileName() +
": Archive::children failed: " + toString(std::move(err)));
return v;
}
void LazyObjFile::fetch() {
if (mb.getBuffer().empty())
return;
InputFile *file;
if (isBitcode(mb))
file = make<BitcodeFile>(mb, "", 0, std::move(symbols));
else
file = make<ObjFile>(mb, std::move(symbols));
mb = {};
symtab->addFile(file);
}
void LazyObjFile::parse() {
if (isBitcode(this->mb)) {
// Bitcode file.
std::unique_ptr<lto::InputFile> obj =
CHECK(lto::InputFile::create(this->mb), this);
for (const lto::InputFile::Symbol &sym : obj->symbols()) {
if (!sym.isUndefined())
symtab->addLazyObject(this, sym.getName());
}
return;
}
// Native object file.
std::unique_ptr<Binary> coffObjPtr = CHECK(createBinary(mb), this);
COFFObjectFile *coffObj = cast<COFFObjectFile>(coffObjPtr.get());
uint32_t numSymbols = coffObj->getNumberOfSymbols();
for (uint32_t i = 0; i < numSymbols; ++i) {
COFFSymbolRef coffSym = check(coffObj->getSymbol(i));
if (coffSym.isUndefined() || !coffSym.isExternal() ||
coffSym.isWeakExternal())
continue;
StringRef name = check(coffObj->getSymbolName(coffSym));
if (coffSym.isAbsolute() && ignoredSymbolName(name))
continue;
symtab->addLazyObject(this, name);
i += coffSym.getNumberOfAuxSymbols();
}
}
void ObjFile::parse() {
// Parse a memory buffer as a COFF file.
std::unique_ptr<Binary> bin = CHECK(createBinary(mb), this);
if (auto *obj = dyn_cast<COFFObjectFile>(bin.get())) {
bin.release();
coffObj.reset(obj);
} else {
fatal(toString(this) + " is not a COFF file");
}
// Read section and symbol tables.
initializeChunks();
initializeSymbols();
initializeFlags();
initializeDependencies();
}
const coff_section *ObjFile::getSection(uint32_t i) {
auto sec = coffObj->getSection(i);
if (!sec)
fatal("getSection failed: #" + Twine(i) + ": " + toString(sec.takeError()));
return *sec;
}
// We set SectionChunk pointers in the SparseChunks vector to this value
// temporarily to mark comdat sections as having an unknown resolution. As we
// walk the object file's symbol table, once we visit either a leader symbol or
// an associative section definition together with the parent comdat's leader,
// we set the pointer to either nullptr (to mark the section as discarded) or a
// valid SectionChunk for that section.
static SectionChunk *const pendingComdat = reinterpret_cast<SectionChunk *>(1);
void ObjFile::initializeChunks() {
uint32_t numSections = coffObj->getNumberOfSections();
sparseChunks.resize(numSections + 1);
for (uint32_t i = 1; i < numSections + 1; ++i) {
const coff_section *sec = getSection(i);
if (sec->Characteristics & IMAGE_SCN_LNK_COMDAT)
sparseChunks[i] = pendingComdat;
else
sparseChunks[i] = readSection(i, nullptr, "");
}
}
SectionChunk *ObjFile::readSection(uint32_t sectionNumber,
const coff_aux_section_definition *def,
StringRef leaderName) {
const coff_section *sec = getSection(sectionNumber);
StringRef name;
if (Expected<StringRef> e = coffObj->getSectionName(sec))
name = *e;
else
fatal("getSectionName failed: #" + Twine(sectionNumber) + ": " +
toString(e.takeError()));
if (name == ".drectve") {
ArrayRef<uint8_t> data;
cantFail(coffObj->getSectionContents(sec, data));
directives = StringRef((const char *)data.data(), data.size());
return nullptr;
}
if (name == ".llvm_addrsig") {
addrsigSec = sec;
return nullptr;
}
// Object files may have DWARF debug info or MS CodeView debug info
// (or both).
//
// DWARF sections don't need any special handling from the perspective
// of the linker; they are just data sections containing relocations.
// We can just link them to complete debug info.
//
// CodeView needs linker support. We need to interpret debug info,
// and then write it to a separate .pdb file.
// Ignore DWARF debug info unless /debug is given.
if (!config->debug && name.startswith(".debug_"))
return nullptr;
if (sec->Characteristics & llvm::COFF::IMAGE_SCN_LNK_REMOVE)
return nullptr;
auto *c = make<SectionChunk>(this, sec);
if (def)
c->checksum = def->CheckSum;
// CodeView sections are stored to a different vector because they are not
// linked in the regular manner.
if (c->isCodeView())
debugChunks.push_back(c);
else if (name == ".gfids$y")
guardFidChunks.push_back(c);
else if (name == ".gljmp$y")
guardLJmpChunks.push_back(c);
else if (name == ".sxdata")
sxDataChunks.push_back(c);
else if (config->tailMerge && sec->NumberOfRelocations == 0 &&
name == ".rdata" && leaderName.startswith("??_C@"))
// COFF sections that look like string literal sections (i.e. no
// relocations, in .rdata, leader symbol name matches the MSVC name mangling
// for string literals) are subject to string tail merging.
MergeChunk::addSection(c);
else if (name == ".rsrc" || name.startswith(".rsrc$"))
resourceChunks.push_back(c);
else
chunks.push_back(c);
return c;
}
void ObjFile::includeResourceChunks() {
chunks.insert(chunks.end(), resourceChunks.begin(), resourceChunks.end());
}
void ObjFile::readAssociativeDefinition(
COFFSymbolRef sym, const coff_aux_section_definition *def) {
readAssociativeDefinition(sym, def, def->getNumber(sym.isBigObj()));
}
void ObjFile::readAssociativeDefinition(COFFSymbolRef sym,
const coff_aux_section_definition *def,
uint32_t parentIndex) {
SectionChunk *parent = sparseChunks[parentIndex];
int32_t sectionNumber = sym.getSectionNumber();
auto diag = [&]() {
StringRef name = check(coffObj->getSymbolName(sym));
StringRef parentName;
const coff_section *parentSec = getSection(parentIndex);
if (Expected<StringRef> e = coffObj->getSectionName(parentSec))
parentName = *e;
error(toString(this) + ": associative comdat " + name + " (sec " +
Twine(sectionNumber) + ") has invalid reference to section " +
parentName + " (sec " + Twine(parentIndex) + ")");
};
if (parent == pendingComdat) {
// This can happen if an associative comdat refers to another associative
// comdat that appears after it (invalid per COFF spec) or to a section
// without any symbols.
diag();
return;
}
// Check whether the parent is prevailing. If it is, so are we, and we read
// the section; otherwise mark it as discarded.
if (parent) {
SectionChunk *c = readSection(sectionNumber, def, "");
sparseChunks[sectionNumber] = c;
if (c) {
c->selection = IMAGE_COMDAT_SELECT_ASSOCIATIVE;
parent->addAssociative(c);
}
} else {
sparseChunks[sectionNumber] = nullptr;
}
}
void ObjFile::recordPrevailingSymbolForMingw(
COFFSymbolRef sym, DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
// For comdat symbols in executable sections, where this is the copy
// of the section chunk we actually include instead of discarding it,
// add the symbol to a map to allow using it for implicitly
// associating .[px]data$<func> sections to it.
+ // Use the suffix from the .text$<func> instead of the leader symbol
+ // name, for cases where the names differ (i386 mangling/decorations,
+ // cases where the leader is a weak symbol named .weak.func.default*).
int32_t sectionNumber = sym.getSectionNumber();
SectionChunk *sc = sparseChunks[sectionNumber];
if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) {
- StringRef name;
- name = check(coffObj->getSymbolName(sym));
- if (getMachineType() == I386)
- name.consume_front("_");
+ StringRef name = sc->getSectionName().split('$').second;
prevailingSectionMap[name] = sectionNumber;
}
}
void ObjFile::maybeAssociateSEHForMingw(
COFFSymbolRef sym, const coff_aux_section_definition *def,
const DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
StringRef name = check(coffObj->getSymbolName(sym));
if (name.consume_front(".pdata$") || name.consume_front(".xdata$") ||
name.consume_front(".eh_frame$")) {
// For MinGW, treat .[px]data$<func> and .eh_frame$<func> as implicitly
// associative to the symbol <func>.
auto parentSym = prevailingSectionMap.find(name);
if (parentSym != prevailingSectionMap.end())
readAssociativeDefinition(sym, def, parentSym->second);
}
}
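// Illustrative example: if the prevailing comdat for a function was read from
// a section named ".text$my_func", recordPrevailingSymbolForMingw recorded the
// key "my_func"; a later ".pdata$my_func" or ".xdata$my_func" comdat is then
// associated with that section here, so it is kept or discarded together with
// the code it describes.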
Symbol *ObjFile::createRegular(COFFSymbolRef sym) {
SectionChunk *sc = sparseChunks[sym.getSectionNumber()];
if (sym.isExternal()) {
StringRef name = check(coffObj->getSymbolName(sym));
if (sc)
return symtab->addRegular(this, name, sym.getGeneric(), sc,
sym.getValue());
// For MinGW symbols named .weak.* that point to a discarded section,
// don't create an Undefined symbol. If nothing ever refers to the symbol,
// everything should be fine. If something actually refers to the symbol
// (e.g. the undefined weak alias), linking will fail due to undefined
// references at the end.
if (config->mingw && name.startswith(".weak."))
return nullptr;
return symtab->addUndefined(name, this, false);
}
if (sc)
return make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric(), sc);
return nullptr;
}
void ObjFile::initializeSymbols() {
uint32_t numSymbols = coffObj->getNumberOfSymbols();
symbols.resize(numSymbols);
SmallVector<std::pair<Symbol *, uint32_t>, 8> weakAliases;
std::vector<uint32_t> pendingIndexes;
pendingIndexes.reserve(numSymbols);
DenseMap<StringRef, uint32_t> prevailingSectionMap;
std::vector<const coff_aux_section_definition *> comdatDefs(
coffObj->getNumberOfSections() + 1);
for (uint32_t i = 0; i < numSymbols; ++i) {
COFFSymbolRef coffSym = check(coffObj->getSymbol(i));
bool prevailingComdat;
if (coffSym.isUndefined()) {
symbols[i] = createUndefined(coffSym);
} else if (coffSym.isWeakExternal()) {
symbols[i] = createUndefined(coffSym);
uint32_t tagIndex = coffSym.getAux<coff_aux_weak_external>()->TagIndex;
weakAliases.emplace_back(symbols[i], tagIndex);
} else if (Optional<Symbol *> optSym =
createDefined(coffSym, comdatDefs, prevailingComdat)) {
symbols[i] = *optSym;
if (config->mingw && prevailingComdat)
recordPrevailingSymbolForMingw(coffSym, prevailingSectionMap);
} else {
// createDefined() returns None if a symbol belongs to a section that
// was pending at the point when the symbol was read. This can happen in
// two cases:
// 1) section definition symbol for a comdat leader;
// 2) symbol belongs to a comdat section associated with another section.
// In both of these cases, we can expect the section to be resolved by
// the time we finish visiting the remaining symbols in the symbol
// table. So we postpone the handling of this symbol until that time.
pendingIndexes.push_back(i);
}
i += coffSym.getNumberOfAuxSymbols();
}
for (uint32_t i : pendingIndexes) {
COFFSymbolRef sym = check(coffObj->getSymbol(i));
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
if (def->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE)
readAssociativeDefinition(sym, def);
else if (config->mingw)
maybeAssociateSEHForMingw(sym, def, prevailingSectionMap);
}
if (sparseChunks[sym.getSectionNumber()] == pendingComdat) {
StringRef name = check(coffObj->getSymbolName(sym));
log("comdat section " + name +
" without leader and unassociated, discarding");
continue;
}
symbols[i] = createRegular(sym);
}
for (auto &kv : weakAliases) {
Symbol *sym = kv.first;
uint32_t idx = kv.second;
checkAndSetWeakAlias(symtab, this, sym, symbols[idx]);
}
// Free the memory used by sparseChunks now that symbol loading is finished.
decltype(sparseChunks)().swap(sparseChunks);
}
Symbol *ObjFile::createUndefined(COFFSymbolRef sym) {
StringRef name = check(coffObj->getSymbolName(sym));
return symtab->addUndefined(name, this, sym.isWeakExternal());
}
void ObjFile::handleComdatSelection(COFFSymbolRef sym, COMDATType &selection,
bool &prevailing, DefinedRegular *leader) {
if (prevailing)
return;
// There's already an existing comdat for this symbol: `Leader`.
// Use the comdat's selection field to determine if the new
// symbol in `Sym` should be discarded, produce a duplicate symbol
// error, etc.
SectionChunk *leaderChunk = nullptr;
COMDATType leaderSelection = IMAGE_COMDAT_SELECT_ANY;
if (leader->data) {
leaderChunk = leader->getChunk();
leaderSelection = leaderChunk->selection;
} else {
// FIXME: comdats from LTO files don't know their selection; treat them
// as "any".
selection = leaderSelection;
}
if ((selection == IMAGE_COMDAT_SELECT_ANY &&
leaderSelection == IMAGE_COMDAT_SELECT_LARGEST) ||
(selection == IMAGE_COMDAT_SELECT_LARGEST &&
leaderSelection == IMAGE_COMDAT_SELECT_ANY)) {
// cl.exe picks "any" for vftables when building with /GR- and
// "largest" when building with /GR. To be able to link object files
// compiled with each flag, "any" and "largest" are merged as "largest".
leaderSelection = selection = IMAGE_COMDAT_SELECT_LARGEST;
}
// GCC's __declspec(selectany) doesn't actually pick "any" but "same size as".
// Clang, on the other hand, picks "any". To be able to link two object files
// with a __declspec(selectany) declaration, one compiled with gcc and the
// other with clang, we merge them as "same size as".
if (config->mingw && ((selection == IMAGE_COMDAT_SELECT_ANY &&
leaderSelection == IMAGE_COMDAT_SELECT_SAME_SIZE) ||
(selection == IMAGE_COMDAT_SELECT_SAME_SIZE &&
leaderSelection == IMAGE_COMDAT_SELECT_ANY))) {
leaderSelection = selection = IMAGE_COMDAT_SELECT_SAME_SIZE;
}
// Other than that, comdat selections must match. This is a bit more
// strict than link.exe which allows merging "any" and "largest" if "any"
// is the first symbol the linker sees, and it allows merging "largest"
// with everything (!) if "largest" is the first symbol the linker sees.
// Making this symmetric independent of which selection is seen first
// seems better though.
// (This behavior matches ModuleLinker::getComdatResult().)
if (selection != leaderSelection) {
log(("conflicting comdat type for " + toString(*leader) + ": " +
Twine((int)leaderSelection) + " in " + toString(leader->getFile()) +
" and " + Twine((int)selection) + " in " + toString(this))
.str());
symtab->reportDuplicate(leader, this);
return;
}
switch (selection) {
case IMAGE_COMDAT_SELECT_NODUPLICATES:
symtab->reportDuplicate(leader, this);
break;
case IMAGE_COMDAT_SELECT_ANY:
// Nothing to do.
break;
case IMAGE_COMDAT_SELECT_SAME_SIZE:
if (leaderChunk->getSize() != getSection(sym)->SizeOfRawData)
symtab->reportDuplicate(leader, this);
break;
case IMAGE_COMDAT_SELECT_EXACT_MATCH: {
SectionChunk newChunk(this, getSection(sym));
// link.exe only compares section contents here and doesn't complain
// if the two comdat sections have e.g. different alignment.
// Match that.
if (leaderChunk->getContents() != newChunk.getContents())
symtab->reportDuplicate(leader, this, &newChunk, sym.getValue());
break;
}
case IMAGE_COMDAT_SELECT_ASSOCIATIVE:
// createDefined() is never called for IMAGE_COMDAT_SELECT_ASSOCIATIVE.
// (This means lld-link doesn't produce duplicate symbol errors for
// associative comdats while link.exe does, but associative comdats
// are never extern in practice.)
llvm_unreachable("createDefined not called for associative comdats");
case IMAGE_COMDAT_SELECT_LARGEST:
if (leaderChunk->getSize() < getSection(sym)->SizeOfRawData) {
// Replace the existing comdat symbol with the new one.
StringRef name = check(coffObj->getSymbolName(sym));
// FIXME: This is incorrect: With /opt:noref, the previous sections
// make it into the final executable as well. Correct handling would
// be to undo reading of the whole old section that's being replaced,
// or doing one pass that determines what the final largest comdat
// is for all IMAGE_COMDAT_SELECT_LARGEST comdats and then reading
// only the largest one.
replaceSymbol<DefinedRegular>(leader, this, name, /*IsCOMDAT*/ true,
/*IsExternal*/ true, sym.getGeneric(),
nullptr);
prevailing = true;
}
break;
case IMAGE_COMDAT_SELECT_NEWEST:
llvm_unreachable("should have been rejected earlier");
}
}
Optional<Symbol *> ObjFile::createDefined(
COFFSymbolRef sym,
std::vector<const coff_aux_section_definition *> &comdatDefs,
bool &prevailing) {
prevailing = false;
auto getName = [&]() { return check(coffObj->getSymbolName(sym)); };
if (sym.isCommon()) {
auto *c = make<CommonChunk>(sym);
chunks.push_back(c);
return symtab->addCommon(this, getName(), sym.getValue(), sym.getGeneric(),
c);
}
if (sym.isAbsolute()) {
StringRef name = getName();
if (name == "@feat.00")
feat00Flags = sym.getValue();
// Skip special symbols.
if (ignoredSymbolName(name))
return nullptr;
if (sym.isExternal())
return symtab->addAbsolute(name, sym);
return make<DefinedAbsolute>(name, sym);
}
int32_t sectionNumber = sym.getSectionNumber();
if (sectionNumber == llvm::COFF::IMAGE_SYM_DEBUG)
return nullptr;
if (llvm::COFF::isReservedSectionNumber(sectionNumber))
fatal(toString(this) + ": " + getName() +
" should not refer to special section " + Twine(sectionNumber));
if ((uint32_t)sectionNumber >= sparseChunks.size())
fatal(toString(this) + ": " + getName() +
" should not refer to non-existent section " + Twine(sectionNumber));
// Comdat handling.
// A comdat symbol consists of two symbol table entries.
// The first symbol entry has the name of the section (e.g. .text), fixed
// values for the other fields, and one auxiliary record.
// The second symbol entry has the name of the comdat symbol, called the
// "comdat leader".
// When this function is called for the first symbol entry of a comdat,
// it sets comdatDefs and returns None, and when it's called for the second
// symbol entry it reads comdatDefs and then sets it back to nullptr.
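// For illustration only (hypothetical names), a comdat typically looks like
// this in the symbol table:
//   .text$mn    SECT4  Static    (one aux record: Length, CheckSum, Selection)
//   ?f@@YAHXZ   SECT4  External  (the comdat leader)
// The aux record of the first entry is what arrives here via comdatDefs.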
// Handle comdat leader.
if (const coff_aux_section_definition *def = comdatDefs[sectionNumber]) {
comdatDefs[sectionNumber] = nullptr;
DefinedRegular *leader;
if (sym.isExternal()) {
std::tie(leader, prevailing) =
symtab->addComdat(this, getName(), sym.getGeneric());
} else {
leader = make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric());
prevailing = true;
}
if (def->Selection < (int)IMAGE_COMDAT_SELECT_NODUPLICATES ||
// Intentionally ends at IMAGE_COMDAT_SELECT_LARGEST: link.exe
// doesn't understand IMAGE_COMDAT_SELECT_NEWEST either.
def->Selection > (int)IMAGE_COMDAT_SELECT_LARGEST) {
fatal("unknown comdat type " + std::to_string((int)def->Selection) +
" for " + getName() + " in " + toString(this));
}
COMDATType selection = (COMDATType)def->Selection;
if (leader->isCOMDAT)
handleComdatSelection(sym, selection, prevailing, leader);
if (prevailing) {
SectionChunk *c = readSection(sectionNumber, def, getName());
sparseChunks[sectionNumber] = c;
c->sym = cast<DefinedRegular>(leader);
c->selection = selection;
cast<DefinedRegular>(leader)->data = &c->repl;
} else {
sparseChunks[sectionNumber] = nullptr;
}
return leader;
}
// Prepare to handle the comdat leader symbol by setting the section's
// ComdatDefs pointer if we encounter a non-associative comdat.
if (sparseChunks[sectionNumber] == pendingComdat) {
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
if (def->Selection != IMAGE_COMDAT_SELECT_ASSOCIATIVE)
comdatDefs[sectionNumber] = def;
}
return None;
}
return createRegular(sym);
}
MachineTypes ObjFile::getMachineType() {
if (coffObj)
return static_cast<MachineTypes>(coffObj->getMachine());
return IMAGE_FILE_MACHINE_UNKNOWN;
}
ArrayRef<uint8_t> ObjFile::getDebugSection(StringRef secName) {
if (SectionChunk *sec = SectionChunk::findByName(debugChunks, secName))
return sec->consumeDebugMagic();
return {};
}
// OBJ files systematically store critical information in a .debug$S stream,
// even if the TU was compiled with no debug info. At least two records are
// always there. S_OBJNAME stores a 32-bit signature, which is loaded into the
// PCHSignature member. S_COMPILE3 stores compile-time cmd-line flags. This is
// currently used to initialize the hotPatchable member.
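// For example, cl.exe sets the HotPatch flag in the S_COMPILE3 record when
// compiling with /hotpatch; that flag is what initializes hotPatchable below.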
void ObjFile::initializeFlags() {
ArrayRef<uint8_t> data = getDebugSection(".debug$S");
if (data.empty())
return;
DebugSubsectionArray subsections;
BinaryStreamReader reader(data, support::little);
ExitOnError exitOnErr;
exitOnErr(reader.readArray(subsections, data.size()));
for (const DebugSubsectionRecord &ss : subsections) {
if (ss.kind() != DebugSubsectionKind::Symbols)
continue;
unsigned offset = 0;
// Only parse the first two records. We are only looking for S_OBJNAME
// and S_COMPILE3, and they usually appear at the beginning of the
// stream.
for (unsigned i = 0; i < 2; ++i) {
Expected<CVSymbol> sym = readSymbolFromStream(ss.getRecordData(), offset);
if (!sym) {
consumeError(sym.takeError());
return;
}
if (sym->kind() == SymbolKind::S_COMPILE3) {
auto cs =
cantFail(SymbolDeserializer::deserializeAs<Compile3Sym>(sym.get()));
hotPatchable =
(cs.Flags & CompileSym3Flags::HotPatch) != CompileSym3Flags::None;
}
if (sym->kind() == SymbolKind::S_OBJNAME) {
auto objName = cantFail(SymbolDeserializer::deserializeAs<ObjNameSym>(
sym.get()));
pchSignature = objName.Signature;
}
offset += sym->length();
}
}
}
// Depending on the compilation flags, OBJs can refer to external files that
// are necessary to merge this OBJ into the final PDB. We currently support two
// types of external files: Precomp/PCH OBJs (when compiling with /Yc and /Yu)
// and PDB type servers (when compiling with /Zi). This function extracts these
// dependencies and makes them available as a TpiSource interface (see
// DebugTypes.h). Both cases only happen with cl.exe: clang-cl produces regular
// output even with /Yc and /Yu and with /Zi.
void ObjFile::initializeDependencies() {
if (!config->debug)
return;
bool isPCH = false;
ArrayRef<uint8_t> data = getDebugSection(".debug$P");
if (!data.empty())
isPCH = true;
else
data = getDebugSection(".debug$T");
if (data.empty())
return;
// Get the first type record. It will indicate if this object uses a type
// server (/Zi) or a PCH file (/Yu).
CVTypeArray types;
BinaryStreamReader reader(data, support::little);
cantFail(reader.readArray(types, reader.getLength()));
CVTypeArray::Iterator firstType = types.begin();
if (firstType == types.end())
return;
// Remember the .debug$T or .debug$P section.
debugTypes = data;
// This object file is a PCH file that others will depend on.
if (isPCH) {
debugTypesObj = makePrecompSource(this);
return;
}
// This object file was compiled with /Zi. Enqueue the PDB dependency.
if (firstType->kind() == LF_TYPESERVER2) {
TypeServer2Record ts = cantFail(
TypeDeserializer::deserializeAs<TypeServer2Record>(firstType->data()));
debugTypesObj = makeUseTypeServerSource(this, ts);
PDBInputFile::enqueue(ts.getName(), this);
return;
}
// This object was compiled with /Yu. It uses types from another object file
// with a matching signature.
if (firstType->kind() == LF_PRECOMP) {
PrecompRecord precomp = cantFail(
TypeDeserializer::deserializeAs<PrecompRecord>(firstType->data()));
debugTypesObj = makeUsePrecompSource(this, precomp);
return;
}
// This is a plain old object file.
debugTypesObj = makeTpiSource(this);
}
// Make a PDB path assuming the PDB is in the same folder as the OBJ
static std::string getPdbBaseName(ObjFile *file, StringRef tSPath) {
StringRef localPath =
!file->parentName.empty() ? file->parentName : file->getName();
SmallString<128> path = sys::path::parent_path(localPath);
// Currently, type server PDBs are only created by MSVC cl, which only runs
// on Windows, so we can assume type server paths are Windows style.
sys::path::append(path,
sys::path::filename(tSPath, sys::path::Style::windows));
return std::string(path.str());
}
// The casing of the PDB path stamped in the OBJ can differ from the actual path
// on disk. With this, we ensure that lowercase is always used as the key for
// the PDBInputFile::instances map, at least on Windows.
static std::string normalizePdbPath(StringRef path) {
#if defined(_WIN32)
return path.lower();
#else // LINUX
return std::string(path);
#endif
}
// If existing, return the actual PDB path on disk.
static Optional<std::string> findPdbPath(StringRef pdbPath,
ObjFile *dependentFile) {
// Ensure the file exists before anything else. In some cases, if the path
// points to a removable device, Driver::enqueuePath() would fail with an
// error (EAGAIN, "resource unavailable try again") which we want to skip
// silently.
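// For example (hypothetical paths): if the dependent OBJ lives in C:\build and
// records the type server path C:\other\vc140.pdb, we first try that path
// verbatim and then fall back to C:\build\vc140.pdb next to the OBJ.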
if (llvm::sys::fs::exists(pdbPath))
return normalizePdbPath(pdbPath);
std::string ret = getPdbBaseName(dependentFile, pdbPath);
if (llvm::sys::fs::exists(ret))
return normalizePdbPath(ret);
return None;
}
PDBInputFile::PDBInputFile(MemoryBufferRef m) : InputFile(PDBKind, m) {}
PDBInputFile::~PDBInputFile() = default;
PDBInputFile *PDBInputFile::findFromRecordPath(StringRef path,
ObjFile *fromFile) {
auto p = findPdbPath(path.str(), fromFile);
if (!p)
return nullptr;
auto it = PDBInputFile::instances.find(*p);
if (it != PDBInputFile::instances.end())
return it->second;
return nullptr;
}
void PDBInputFile::enqueue(StringRef path, ObjFile *fromFile) {
auto p = findPdbPath(path.str(), fromFile);
if (!p)
return;
auto it = PDBInputFile::instances.emplace(*p, nullptr);
if (!it.second)
return; // already scheduled for load
driver->enqueuePDB(*p);
}
void PDBInputFile::parse() {
PDBInputFile::instances[mb.getBufferIdentifier().str()] = this;
std::unique_ptr<pdb::IPDBSession> thisSession;
loadErr.emplace(pdb::NativeSession::createFromPdb(
MemoryBuffer::getMemBuffer(mb, false), thisSession));
if (*loadErr)
return; // fail silently at this point - the error will be handled later,
// when merging the debug type stream
session.reset(static_cast<pdb::NativeSession *>(thisSession.release()));
pdb::PDBFile &pdbFile = session->getPDBFile();
auto expectedInfo = pdbFile.getPDBInfoStream();
// All PDB Files should have an Info stream.
if (!expectedInfo) {
loadErr.emplace(expectedInfo.takeError());
return;
}
debugTypesObj = makeTypeServerSource(this);
}
// Used only for DWARF debug info, which is not common (except in MinGW
// environments). This returns an optional pair of file name and line
// number for where the variable was defined.
Optional<std::pair<StringRef, uint32_t>>
ObjFile::getVariableLocation(StringRef var) {
if (!dwarf) {
dwarf = make<DWARFCache>(DWARFContext::create(*getCOFFObj()));
if (!dwarf)
return None;
}
if (config->machine == I386)
var.consume_front("_");
Optional<std::pair<std::string, unsigned>> ret = dwarf->getVariableLoc(var);
if (!ret)
return None;
return std::make_pair(saver.save(ret->first), ret->second);
}
// Used only for DWARF debug info, which is not common (except in MinGW
// environments).
Optional<DILineInfo> ObjFile::getDILineInfo(uint32_t offset,
uint32_t sectionIndex) {
if (!dwarf) {
dwarf = make<DWARFCache>(DWARFContext::create(*getCOFFObj()));
if (!dwarf)
return None;
}
return dwarf->getDILineInfo(offset, sectionIndex);
}
static StringRef ltrim1(StringRef s, const char *chars) {
if (!s.empty() && strchr(chars, s[0]))
return s.substr(1);
return s;
}
void ImportFile::parse() {
const char *buf = mb.getBufferStart();
const auto *hdr = reinterpret_cast<const coff_import_header *>(buf);
// Check if the total size is valid.
if (mb.getBufferSize() != sizeof(*hdr) + hdr->SizeOfData)
fatal("broken import library");
// Read names and create an __imp_ symbol.
StringRef name = saver.save(StringRef(buf + sizeof(*hdr)));
StringRef impName = saver.save("__imp_" + name);
const char *nameStart = buf + sizeof(coff_import_header) + name.size() + 1;
dllName = std::string(StringRef(nameStart));
StringRef extName;
switch (hdr->getNameType()) {
case IMPORT_ORDINAL:
extName = "";
break;
case IMPORT_NAME:
extName = name;
break;
case IMPORT_NAME_NOPREFIX:
extName = ltrim1(name, "?@_");
break;
case IMPORT_NAME_UNDECORATE:
extName = ltrim1(name, "?@_");
extName = extName.substr(0, extName.find('@'));
break;
}
this->hdr = hdr;
externalName = extName;
impSym = symtab->addImportData(impName, this);
// If this was a duplicate, we logged an error but may continue;
// in this case, impSym is nullptr.
if (!impSym)
return;
if (hdr->getType() == llvm::COFF::IMPORT_CONST)
static_cast<void>(symtab->addImportData(name, this));
// If the type is a function, we need to create a thunk which jumps to the
// address pointed to by the __imp_ symbol. (This allows you to call DLL
// functions just like regular non-DLL functions.)
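// For example, a call to ExitProcess in user code can be satisfied by a thunk
// symbol named "ExitProcess" that simply does an indirect jump through
// __imp_ExitProcess (the IAT slot).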
if (hdr->getType() == llvm::COFF::IMPORT_CODE)
thunkSym = symtab->addImportThunk(
name, cast_or_null<DefinedImportData>(impSym), hdr->Machine);
}
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive)
: BitcodeFile(mb, archiveName, offsetInArchive, {}) {}
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive,
std::vector<Symbol *> &&symbols)
: InputFile(BitcodeKind, mb), symbols(std::move(symbols)) {
std::string path = mb.getBufferIdentifier().str();
if (config->thinLTOIndexOnly)
path = replaceThinLTOSuffix(mb.getBufferIdentifier());
// ThinLTO assumes that all MemoryBufferRefs given to it have a unique
// name. If two archives define two members with the same name, this
// causes a collision which results in only one of the objects being taken
// into consideration at LTO time (which very likely causes undefined
// symbols later in the link stage). So we append the file offset to make the
// filename unique.
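// For example (hypothetical names), a member "bar.o" at offset 1234 inside
// "libfoo.a" gets the identifier "libfoo.abar.o1234", which stays unique even
// if another archive also contains a "bar.o".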
MemoryBufferRef mbref(
mb.getBuffer(),
saver.save(archiveName.empty() ? path
: archiveName + sys::path::filename(path) +
utostr(offsetInArchive)));
obj = check(lto::InputFile::create(mbref));
}
BitcodeFile::~BitcodeFile() = default;
void BitcodeFile::parse() {
std::vector<std::pair<Symbol *, bool>> comdat(obj->getComdatTable().size());
for (size_t i = 0; i != obj->getComdatTable().size(); ++i)
// FIXME: lto::InputFile doesn't keep enough data to do correct comdat
// selection handling.
comdat[i] = symtab->addComdat(this, saver.save(obj->getComdatTable()[i]));
for (const lto::InputFile::Symbol &objSym : obj->symbols()) {
StringRef symName = saver.save(objSym.getName());
int comdatIndex = objSym.getComdatIndex();
Symbol *sym;
if (objSym.isUndefined()) {
sym = symtab->addUndefined(symName, this, false);
} else if (objSym.isCommon()) {
sym = symtab->addCommon(this, symName, objSym.getCommonSize());
} else if (objSym.isWeak() && objSym.isIndirect()) {
// Weak external.
sym = symtab->addUndefined(symName, this, true);
std::string fallback = std::string(objSym.getCOFFWeakExternalFallback());
Symbol *alias = symtab->addUndefined(saver.save(fallback));
checkAndSetWeakAlias(symtab, this, sym, alias);
} else if (comdatIndex != -1) {
if (symName == obj->getComdatTable()[comdatIndex])
sym = comdat[comdatIndex].first;
else if (comdat[comdatIndex].second)
sym = symtab->addRegular(this, symName);
else
sym = symtab->addUndefined(symName, this, false);
} else {
sym = symtab->addRegular(this, symName);
}
symbols.push_back(sym);
if (objSym.isUsed())
config->gcroot.push_back(sym);
}
directives = obj->getCOFFLinkerOpts();
}
MachineTypes BitcodeFile::getMachineType() {
switch (Triple(obj->getTargetTriple()).getArch()) {
case Triple::x86_64:
return AMD64;
case Triple::x86:
return I386;
case Triple::arm:
return ARMNT;
case Triple::aarch64:
return ARM64;
default:
return IMAGE_FILE_MACHINE_UNKNOWN;
}
}
std::string lld::coff::replaceThinLTOSuffix(StringRef path) {
StringRef suffix = config->thinLTOObjectSuffixReplace.first;
StringRef repl = config->thinLTOObjectSuffixReplace.second;
if (path.consume_back(suffix))
return (path + repl).str();
return std::string(path);
}
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index bded985f04d0..e24cdca6ee34 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -1,166 +1,175 @@
//===- MinGW.cpp ----------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MinGW.h"
#include "SymbolTable.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::COFF;
using namespace lld;
using namespace lld::coff;
AutoExporter::AutoExporter() {
excludeLibs = {
"libgcc",
"libgcc_s",
"libstdc++",
"libmingw32",
"libmingwex",
"libg2c",
"libsupc++",
"libobjc",
"libgcj",
"libclang_rt.builtins",
"libclang_rt.builtins-aarch64",
"libclang_rt.builtins-arm",
"libclang_rt.builtins-i386",
"libclang_rt.builtins-x86_64",
+ "libclang_rt.profile",
+ "libclang_rt.profile-aarch64",
+ "libclang_rt.profile-arm",
+ "libclang_rt.profile-i386",
+ "libclang_rt.profile-x86_64",
"libc++",
"libc++abi",
"libunwind",
"libmsvcrt",
"libucrtbase",
};
excludeObjects = {
"crt0.o", "crt1.o", "crt1u.o", "crt2.o", "crt2u.o", "dllcrt1.o",
"dllcrt2.o", "gcrt0.o", "gcrt1.o", "gcrt2.o", "crtbegin.o", "crtend.o",
};
excludeSymbolPrefixes = {
// Import symbols
"__imp_",
"__IMPORT_DESCRIPTOR_",
// Extra import symbols from GNU import libraries
"__nm_",
// C++ symbols
"__rtti_",
"__builtin_",
// Artificial symbols such as .refptr
".",
+ // profile generate symbols
+ "__profc_",
+ "__profd_",
+ "__profvp_",
};
excludeSymbolSuffixes = {
"_iname",
"_NULL_THUNK_DATA",
};
if (config->machine == I386) {
excludeSymbols = {
"__NULL_IMPORT_DESCRIPTOR",
"__pei386_runtime_relocator",
"_do_pseudo_reloc",
"_impure_ptr",
"__impure_ptr",
"__fmode",
"_environ",
"___dso_handle",
// These are the MinGW names that differ from the standard
// ones (lacking an extra underscore).
"_DllMain@12",
"_DllEntryPoint@12",
"_DllMainCRTStartup@12",
};
excludeSymbolPrefixes.insert("__head_");
} else {
excludeSymbols = {
"__NULL_IMPORT_DESCRIPTOR",
"_pei386_runtime_relocator",
"do_pseudo_reloc",
"impure_ptr",
"_impure_ptr",
"_fmode",
"environ",
"__dso_handle",
// These are the MinGW names that differ from the standard
// ones (lacking an extra underscore).
"DllMain",
"DllEntryPoint",
"DllMainCRTStartup",
};
excludeSymbolPrefixes.insert("_head_");
}
}
void AutoExporter::addWholeArchive(StringRef path) {
StringRef libName = sys::path::filename(path);
// Drop the file extension, to match the processing below.
libName = libName.substr(0, libName.rfind('.'));
excludeLibs.erase(libName);
}
bool AutoExporter::shouldExport(Defined *sym) const {
if (!sym || !sym->isLive() || !sym->getChunk())
return false;
// Only allow the symbol kinds that make sense to export; in particular,
// disallow import symbols.
if (!isa<DefinedRegular>(sym) && !isa<DefinedCommon>(sym))
return false;
if (excludeSymbols.count(sym->getName()))
return false;
for (StringRef prefix : excludeSymbolPrefixes.keys())
if (sym->getName().startswith(prefix))
return false;
for (StringRef suffix : excludeSymbolSuffixes.keys())
if (sym->getName().endswith(suffix))
return false;
// If a corresponding __imp_ symbol exists and is defined, don't export it.
if (symtab->find(("__imp_" + sym->getName()).str()))
return false;
// Check that the file is non-null before dereferencing it; symbols not
// originating in regular object files probably shouldn't be exported.
if (!sym->getFile())
return false;
StringRef libName = sys::path::filename(sym->getFile()->parentName);
// Drop the file extension.
libName = libName.substr(0, libName.rfind('.'));
if (!libName.empty())
return !excludeLibs.count(libName);
StringRef fileName = sys::path::filename(sym->getFile()->getName());
return !excludeObjects.count(fileName);
}
void lld::coff::writeDefFile(StringRef name) {
std::error_code ec;
raw_fd_ostream os(name, ec, sys::fs::OF_None);
if (ec)
fatal("cannot open " + name + ": " + ec.message());
os << "EXPORTS\n";
for (Export &e : config->exports) {
os << " " << e.exportName << " "
<< "@" << e.ordinal;
if (auto *def = dyn_cast_or_null<Defined>(e.sym)) {
if (def && def->getChunk() &&
!(def->getChunk()->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE))
os << " DATA";
}
os << "\n";
}
}
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index 212879e1d60b..087d53b5d2dd 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -1,265 +1,266 @@
include "llvm/Option/OptParser.td"
// link.exe accepts options starting with either a dash or a slash.
// Flag that takes no arguments.
class F<string name> : Flag<["/", "-", "/?", "-?"], name>;
// Flag that takes one argument after ":".
class P<string name, string help> :
Joined<["/", "-", "/?", "-?"], name#":">, HelpText<help>;
// Boolean flag which can be suffixed by ":no". Using it unsuffixed turns the
// flag on and using it suffixed by ":no" turns it off.
multiclass B<string name, string help_on, string help_off> {
def "" : F<name>, HelpText<help_on>;
def _no : F<name#":no">, HelpText<help_off>;
}
// Same as B<> above, but without help texts, for private undocumented
// options.
multiclass B_priv<string name> {
def "" : F<name>;
def _no : F<name#":no">;
}
def align : P<"align", "Section alignment">;
def aligncomm : P<"aligncomm", "Set common symbol alignment">;
def alternatename : P<"alternatename", "Define weak alias">;
def base : P<"base", "Base address of the program">;
def color_diagnostics: Flag<["--"], "color-diagnostics">,
HelpText<"Use colors in diagnostics">;
def color_diagnostics_eq: Joined<["--"], "color-diagnostics=">,
HelpText<"Use colors in diagnostics; one of 'always', 'never', 'auto'">;
def defaultlib : P<"defaultlib", "Add the library to the list of input files">;
def delayload : P<"delayload", "Delay loaded DLL name">;
def entry : P<"entry", "Name of entry point symbol">;
def errorlimit : P<"errorlimit",
"Maximum number of errors to emit before stopping (0 = no limit)">;
def export : P<"export", "Export a function">;
// No help text because /failifmismatch is not intended to be used by the user.
def failifmismatch : P<"failifmismatch", "">;
def filealign : P<"filealign", "Section alignment in the output file">;
def functionpadmin : F<"functionpadmin">;
def functionpadmin_opt : P<"functionpadmin",
"Prepares an image for hotpatching">;
def guard : P<"guard", "Control flow guard">;
def heap : P<"heap", "Size of the heap">;
def ignore : P<"ignore", "Specify warning codes to ignore">;
def implib : P<"implib", "Import library name">;
def lib : F<"lib">,
HelpText<"Act like lib.exe; must be first argument if present">;
def libpath : P<"libpath", "Additional library search path">;
def linkrepro : P<"linkrepro",
"Dump linker invocation and input files for debugging">;
def lldignoreenv : F<"lldignoreenv">,
HelpText<"Ignore environment variables like %LIB%">;
def lldltocache : P<"lldltocache",
"Path to ThinLTO cached object file directory">;
def lldltocachepolicy : P<"lldltocachepolicy",
"Pruning policy for the ThinLTO cache">;
def lldsavetemps : F<"lldsavetemps">,
HelpText<"Save temporary files instead of deleting them">;
def machine : P<"machine", "Specify target platform">;
def merge : P<"merge", "Combine sections">;
def mllvm : P<"mllvm", "Options to pass to LLVM">;
def nodefaultlib : P<"nodefaultlib", "Remove a default library">;
def opt : P<"opt", "Control optimizations">;
def order : P<"order", "Put functions in order">;
def out : P<"out", "Path to file to write output">;
def natvis : P<"natvis", "Path to natvis file to embed in the PDB">;
def no_color_diagnostics: F<"no-color-diagnostics">,
HelpText<"Do not use colors in diagnostics">;
def pdb : P<"pdb", "PDB file path">;
def pdbstripped : P<"pdbstripped", "Stripped PDB file path">;
def pdbaltpath : P<"pdbaltpath", "PDB file path to embed in the image">;
def pdbstream : Joined<["/", "-", "/?", "-?"], "pdbstream:">,
MetaVarName<"<name>=<file>">,
HelpText<"Embed the contents of <file> in the PDB as named stream <name>">;
def section : P<"section", "Specify section attributes">;
def stack : P<"stack", "Size of the stack">;
def stub : P<"stub", "Specify DOS stub file">;
def subsystem : P<"subsystem", "Specify subsystem">;
def timestamp : P<"timestamp", "Specify the PE header timestamp">;
def version : P<"version", "Specify a version number in the PE header">;
def wholearchive_file : P<"wholearchive",
"Include all object files from this library">;
def disallowlib : Joined<["/", "-", "/?", "-?"], "disallowlib:">,
Alias<nodefaultlib>;
def manifest : F<"manifest">, HelpText<"Create .manifest file">;
def manifest_colon : P<
"manifest",
"NO disables manifest output; EMBED[,ID=#] embeds manifest as resource in the image">;
def manifestuac : P<"manifestuac", "User access control">;
def manifestfile : P<"manifestfile", "Manifest output path, with /manifest">;
def manifestdependency : P<
"manifestdependency",
"Attributes for <dependency> element in manifest file; implies /manifest">;
def manifestinput : P<
"manifestinput",
"Additional manifest inputs; only valid with /manifest:embed">;
// We cannot use multiclass P because class name "incl" is different
// from its command line option name. We do this because "include" is
// a reserved keyword in tablegen.
def incl : Joined<["/", "-", "/?", "-?"], "include:">,
HelpText<"Force symbol to be added to symbol table as undefined one">;
// "def" is also a keyword.
def deffile : Joined<["/", "-", "/?", "-?"], "def:">,
HelpText<"Use module-definition file">;
def debug : F<"debug">, HelpText<"Embed a symbol table in the image">;
def debug_opt : P<"debug", "Embed a symbol table in the image with option">;
def debugtype : P<"debugtype", "Debug Info Options">;
def dll : F<"dll">, HelpText<"Create a DLL">;
def driver : F<"driver">, HelpText<"Generate a Windows NT Kernel Mode Driver">;
def driver_wdm : F<"driver:wdm">,
HelpText<"Set IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER bit in PE header">;
def driver_uponly : F<"driver:uponly">,
HelpText<"Set IMAGE_FILE_UP_SYSTEM_ONLY bit in PE header">;
def driver_wdm_uponly : F<"driver:wdm,uponly">;
def driver_uponly_wdm : F<"driver:uponly,wdm">;
def nodefaultlib_all : F<"nodefaultlib">,
HelpText<"Remove all default libraries">;
def noentry : F<"noentry">,
HelpText<"Don't add reference to DllMainCRTStartup; only valid with /dll">;
def profile : F<"profile">;
def repro : F<"Brepro">,
HelpText<"Use a hash of the executable as the PE header timestamp">;
def reproduce : P<"reproduce",
"Dump linker invocation and input files for debugging">;
def swaprun : P<"swaprun",
"Comma-separated list of 'cd' or 'net'">;
def swaprun_cd : F<"swaprun:cd">, Alias<swaprun>, AliasArgs<["cd"]>,
HelpText<"Make loader run output binary from swap instead of from CD">;
def swaprun_net : F<"swaprun:net">, Alias<swaprun>, AliasArgs<["net"]>,
HelpText<"Make loader run output binary from swap instead of from network">;
def verbose : F<"verbose">;
def wholearchive_flag : F<"wholearchive">,
HelpText<"Include all object files from all libraries">;
def force : F<"force">,
HelpText<"Allow undefined and multiply defined symbols">;
def force_unresolved : F<"force:unresolved">,
HelpText<"Allow undefined symbols when creating executables">;
def force_multiple : F<"force:multiple">,
HelpText<"Allow multiply defined symbols when creating executables">;
def force_multipleres : F<"force:multipleres">,
HelpText<"Allow multiply defined resources when creating executables">;
defm WX : B<"WX", "Treat warnings as errors", "Don't treat warnings as errors">;
defm allowbind : B<"allowbind", "Enable DLL binding (default)",
"Disable DLL binding">;
defm allowisolation : B<"allowisolation", "Enable DLL isolation (default)",
"Disable DLL isolation">;
defm appcontainer : B<"appcontainer",
"Image can only be run in an app container",
"Image can run outside an app container (default)">;
defm cetcompat : B<"cetcompat", "Mark executable image as compatible with Control-flow Enforcement Technology (CET) Shadow Stack",
"Don't mark executable image as compatible with Control-flow Enforcement Technology (CET) Shadow Stack (default)">;
defm dynamicbase : B<"dynamicbase", "Enable ASLR (default unless /fixed)",
"Disable ASLR (default when /fixed)">;
defm fixed : B<"fixed", "Disable base relocations",
"Enable base relocations (default)">;
defm highentropyva : B<"highentropyva",
"Enable 64-bit ASLR (default on 64-bit)",
"Disable 64-bit ASLR">;
defm incremental : B<"incremental",
"Keep original import library if contents are unchanged",
"Overwrite import library even if contents are unchanged">;
defm integritycheck : B<"integritycheck",
"Set FORCE_INTEGRITY bit in PE header",
"No effect (default)">;
defm largeaddressaware : B<"largeaddressaware",
"Enable large addresses (default on 64-bit)",
"Disable large addresses (default on 32-bit)">;
defm nxcompat : B<"nxcompat", "Enable data execution prevention (default)",
"Disable data execution provention">;
defm safeseh : B<"safeseh",
"Produce an image with Safe Exception Handler (only for x86)",
"Don't produce an image with Safe Exception Handler">;
defm tsaware : B<"tsaware",
"Create Terminal Server aware executable (default)",
"Create non-Terminal Server aware executable">;
def help : F<"help">;
// /?? and -?? must be before /? and -? to not confuse lib/Options.
def help_q : Flag<["/??", "-??", "/?", "-?"], "">, Alias<help>;
// LLD extensions
defm auto_import : B_priv<"auto-import">;
defm runtime_pseudo_reloc : B_priv<"runtime-pseudo-reloc">;
def end_lib : F<"end-lib">,
HelpText<"Ends group of objects treated as if they were in a library">;
def exclude_all_symbols : F<"exclude-all-symbols">;
def export_all_symbols : F<"export-all-symbols">;
defm demangle : B<"demangle",
"Demangle symbols in output (default)",
"Do not demangle symbols in output">;
def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">,
HelpText<"Add symbol as undefined, but allow it to remain undefined">;
def kill_at : F<"kill-at">;
def lldmingw : F<"lldmingw">;
+def noseh : F<"noseh">;
def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">;
def pdb_source_path : P<"pdbsourcepath",
"Base path used to make relative source file path absolute in PDB">;
def rsp_quoting : Joined<["--"], "rsp-quoting=">,
HelpText<"Quoting style for response files, 'windows' (default) or 'posix'">;
def start_lib : F<"start-lib">,
HelpText<"Starts group of objects treated as if they were in a library">;
def thinlto_emit_imports_files :
F<"thinlto-emit-imports-files">,
HelpText<"Emit .imports files with -thinlto-index-only">;
def thinlto_index_only :
F<"thinlto-index-only">,
HelpText<"Instead of linking, emit ThinLTO index files">;
def thinlto_index_only_arg : P<
"thinlto-index-only",
"-thinlto-index-only and also write native module names to file">;
def thinlto_object_suffix_replace : P<
"thinlto-object-suffix-replace",
"'old;new' replace old suffix with new suffix in ThinLTO index">;
def thinlto_prefix_replace: P<
"thinlto-prefix-replace",
"'old;new' replace old prefix with new prefix in ThinLTO outputs">;
def lto_obj_path : P<
"lto-obj-path",
"output native object for merged LTO unit to this path">;
def dash_dash_version : Flag<["--"], "version">,
HelpText<"Print version information">;
def threads
: P<"threads", "Number of threads. '1' disables multi-threading. By "
"default all available hardware threads are used">;
// Flags for debugging
def lldmap : F<"lldmap">;
def lldmap_file : Joined<["/", "-", "/?", "-?"], "lldmap:">;
def map : F<"map">;
def map_file : Joined<["/", "-", "/?", "-?"], "map:">;
def show_timing : F<"time">;
def summary : F<"summary">;
//==============================================================================
// The flags below do nothing. They are defined only for link.exe compatibility.
//==============================================================================
class QF<string name> : Joined<["/", "-", "/?", "-?"], name#":">;
def ignoreidl : F<"ignoreidl">;
def nologo : F<"nologo">;
def throwingnew : F<"throwingnew">;
def editandcontinue : F<"editandcontinue">;
def fastfail : F<"fastfail">;
def delay : QF<"delay">;
def errorreport : QF<"errorreport">;
def idlout : QF<"idlout">;
def maxilksize : QF<"maxilksize">;
def tlbid : QF<"tlbid">;
def tlbout : QF<"tlbout">;
def verbose_all : QF<"verbose">;
def guardsym : QF<"guardsym">;
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 3bcc1777f7ac..082de5b8c1d6 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1,1992 +1,1992 @@
//===- Writer.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Writer.h"
#include "Config.h"
#include "DLL.h"
#include "InputFiles.h"
#include "LLDMapFile.h"
#include "MapFile.h"
#include "PDB.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Timer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::object;
using namespace llvm::support;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::coff;
/* To re-generate DOSProgram:
$ cat > /tmp/DOSProgram.asm
org 0
; Copy cs to ds.
push cs
pop ds
; Point ds:dx at the $-terminated string.
mov dx, str
; Int 21/AH=09h: Write string to standard output.
mov ah, 0x9
int 0x21
; Int 21/AH=4Ch: Exit with return code (in AL).
mov ax, 0x4C01
int 0x21
str:
db 'This program cannot be run in DOS mode.$'
align 8, db 0
$ nasm -fbin /tmp/DOSProgram.asm -o /tmp/DOSProgram.bin
$ xxd -i /tmp/DOSProgram.bin
*/
static unsigned char dosProgram[] = {
0x0e, 0x1f, 0xba, 0x0e, 0x00, 0xb4, 0x09, 0xcd, 0x21, 0xb8, 0x01, 0x4c,
0xcd, 0x21, 0x54, 0x68, 0x69, 0x73, 0x20, 0x70, 0x72, 0x6f, 0x67, 0x72,
0x61, 0x6d, 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x62, 0x65,
0x20, 0x72, 0x75, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x44, 0x4f, 0x53, 0x20,
0x6d, 0x6f, 0x64, 0x65, 0x2e, 0x24, 0x00, 0x00
};
static_assert(sizeof(dosProgram) % 8 == 0,
"DOSProgram size must be multiple of 8");
static const int dosStubSize = sizeof(dos_header) + sizeof(dosProgram);
static_assert(dosStubSize % 8 == 0, "DOSStub size must be multiple of 8");
static const int numberOfDataDirectory = 16;
// Global vector of all output sections. After output sections are finalized,
// this can be indexed by Chunk::getOutputSection.
static std::vector<OutputSection *> outputSections;
OutputSection *Chunk::getOutputSection() const {
return osidx == 0 ? nullptr : outputSections[osidx - 1];
}
namespace {
class DebugDirectoryChunk : public NonSectionChunk {
public:
DebugDirectoryChunk(const std::vector<std::pair<COFF::DebugType, Chunk *>> &r,
bool writeRepro)
: records(r), writeRepro(writeRepro) {}
size_t getSize() const override {
return (records.size() + int(writeRepro)) * sizeof(debug_directory);
}
void writeTo(uint8_t *b) const override {
auto *d = reinterpret_cast<debug_directory *>(b);
for (const std::pair<COFF::DebugType, Chunk *>& record : records) {
Chunk *c = record.second;
OutputSection *os = c->getOutputSection();
uint64_t offs = os->getFileOff() + (c->getRVA() - os->getRVA());
fillEntry(d, record.first, c->getSize(), c->getRVA(), offs);
++d;
}
if (writeRepro) {
// FIXME: The COFF spec allows either a 0-sized entry to just say
// "the timestamp field is really a hash", or a 4-byte size field
// followed by that many bytes containing a longer hash (with the
// lowest 4 bytes usually being the timestamp in little-endian order).
// Consider storing the full 8 bytes computed by xxHash64 here.
fillEntry(d, COFF::IMAGE_DEBUG_TYPE_REPRO, 0, 0, 0);
}
}
void setTimeDateStamp(uint32_t timeDateStamp) {
for (support::ulittle32_t *tds : timeDateStamps)
*tds = timeDateStamp;
}
private:
void fillEntry(debug_directory *d, COFF::DebugType debugType, size_t size,
uint64_t rva, uint64_t offs) const {
d->Characteristics = 0;
d->TimeDateStamp = 0;
d->MajorVersion = 0;
d->MinorVersion = 0;
d->Type = debugType;
d->SizeOfData = size;
d->AddressOfRawData = rva;
d->PointerToRawData = offs;
timeDateStamps.push_back(&d->TimeDateStamp);
}
mutable std::vector<support::ulittle32_t *> timeDateStamps;
const std::vector<std::pair<COFF::DebugType, Chunk *>> &records;
bool writeRepro;
};
class CVDebugRecordChunk : public NonSectionChunk {
public:
size_t getSize() const override {
return sizeof(codeview::DebugInfo) + config->pdbAltPath.size() + 1;
}
void writeTo(uint8_t *b) const override {
// Save off the DebugInfo entry to backfill the file signature (build id)
// in Writer::writeBuildId
buildId = reinterpret_cast<codeview::DebugInfo *>(b);
// variable sized field (PDB Path)
char *p = reinterpret_cast<char *>(b + sizeof(*buildId));
if (!config->pdbAltPath.empty())
memcpy(p, config->pdbAltPath.data(), config->pdbAltPath.size());
p[config->pdbAltPath.size()] = '\0';
}
mutable codeview::DebugInfo *buildId = nullptr;
};
class ExtendedDllCharacteristicsChunk : public NonSectionChunk {
public:
ExtendedDllCharacteristicsChunk(uint32_t c) : characteristics(c) {}
size_t getSize() const override { return 4; }
void writeTo(uint8_t *buf) const override { write32le(buf, characteristics); }
uint32_t characteristics = 0;
};
// PartialSection represents a group of chunks that contribute to an
// OutputSection. Collating a collection of PartialSections of the same name and
// characteristics constitutes the OutputSection.
class PartialSectionKey {
public:
StringRef name;
unsigned characteristics;
bool operator<(const PartialSectionKey &other) const {
int c = name.compare(other.name);
if (c == 1)
return false;
if (c == 0)
return characteristics < other.characteristics;
return true;
}
};
// The writer writes a SymbolTable result to a file.
class Writer {
public:
Writer() : buffer(errorHandler().outputBuffer) {}
void run();
private:
void createSections();
void createMiscChunks();
void createImportTables();
void appendImportThunks();
void locateImportTables();
void createExportTable();
void mergeSections();
void removeUnusedSections();
void assignAddresses();
void finalizeAddresses();
void removeEmptySections();
void assignOutputSectionIndices();
void createSymbolAndStringTable();
void openFile(StringRef outputPath);
template <typename PEHeaderTy> void writeHeader();
void createSEHTable();
void createRuntimePseudoRelocs();
void insertCtorDtorSymbols();
void createGuardCFTables();
void markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols);
void maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym);
void setSectionPermissions();
void writeSections();
void writeBuildId();
void sortExceptionTable();
void sortCRTSectionChunks(std::vector<Chunk *> &chunks);
void addSyntheticIdata();
void fixPartialSectionChars(StringRef name, uint32_t chars);
bool fixGnuImportChunks();
PartialSection *createPartialSection(StringRef name, uint32_t outChars);
PartialSection *findPartialSection(StringRef name, uint32_t outChars);
llvm::Optional<coff_symbol16> createSymbol(Defined *d);
size_t addEntryToStringTable(StringRef str);
OutputSection *findSection(StringRef name);
void addBaserels();
void addBaserelBlocks(std::vector<Baserel> &v);
uint32_t getSizeOfInitializedData();
std::unique_ptr<FileOutputBuffer> &buffer;
std::map<PartialSectionKey, PartialSection *> partialSections;
std::vector<char> strtab;
std::vector<llvm::object::coff_symbol16> outputSymtab;
IdataContents idata;
Chunk *importTableStart = nullptr;
uint64_t importTableSize = 0;
Chunk *edataStart = nullptr;
Chunk *edataEnd = nullptr;
Chunk *iatStart = nullptr;
uint64_t iatSize = 0;
DelayLoadContents delayIdata;
EdataContents edata;
bool setNoSEHCharacteristic = false;
DebugDirectoryChunk *debugDirectory = nullptr;
std::vector<std::pair<COFF::DebugType, Chunk *>> debugRecords;
CVDebugRecordChunk *buildId = nullptr;
ArrayRef<uint8_t> sectionTable;
uint64_t fileSize;
uint32_t pointerToSymbolTable = 0;
uint64_t sizeOfImage;
uint64_t sizeOfHeaders;
OutputSection *textSec;
OutputSection *rdataSec;
OutputSection *buildidSec;
OutputSection *dataSec;
OutputSection *pdataSec;
OutputSection *idataSec;
OutputSection *edataSec;
OutputSection *didatSec;
OutputSection *rsrcSec;
OutputSection *relocSec;
OutputSection *ctorsSec;
OutputSection *dtorsSec;
// The first and last .pdata sections in the output file.
//
// We need to keep track of the location of .pdata in whichever section it
// gets merged into so that we can sort its contents and emit a correct data
// directory entry for the exception table. This is also the case for some
// other sections (such as .edata) but because the contents of those sections
// are entirely linker-generated we can keep track of their locations using
// the chunks that the linker creates. All .pdata chunks come from input
// files, so we need to keep track of them separately.
Chunk *firstPdata = nullptr;
Chunk *lastPdata;
};
} // anonymous namespace
static Timer codeLayoutTimer("Code Layout", Timer::root());
static Timer diskCommitTimer("Commit Output File", Timer::root());
void lld::coff::writeResult() { Writer().run(); }
void OutputSection::addChunk(Chunk *c) {
chunks.push_back(c);
}
void OutputSection::insertChunkAtStart(Chunk *c) {
chunks.insert(chunks.begin(), c);
}
void OutputSection::setPermissions(uint32_t c) {
header.Characteristics &= ~permMask;
header.Characteristics |= c;
}
void OutputSection::merge(OutputSection *other) {
chunks.insert(chunks.end(), other->chunks.begin(), other->chunks.end());
other->chunks.clear();
contribSections.insert(contribSections.end(), other->contribSections.begin(),
other->contribSections.end());
other->contribSections.clear();
}
// Write the section header to a given buffer.
void OutputSection::writeHeaderTo(uint8_t *buf) {
auto *hdr = reinterpret_cast<coff_section *>(buf);
*hdr = header;
if (stringTableOff) {
// If name is too long, write offset into the string table as a name.
sprintf(hdr->Name, "/%d", stringTableOff);
} else {
assert(!config->debug || name.size() <= COFF::NameSize ||
(hdr->Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0);
strncpy(hdr->Name, name.data(),
std::min(name.size(), (size_t)COFF::NameSize));
}
}
void OutputSection::addContributingPartialSection(PartialSection *sec) {
contribSections.push_back(sec);
}
// Check whether the target address S is in range from a relocation
// of type relType at address P.
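// For example, for the Thumb-2 branches (IMAGE_REL_ARM_BRANCH24T/BLX23T) the
// target is in range if the distance fits in a signed 25-bit value, roughly
// +/-16 MiB, and for ARM64 IMAGE_REL_ARM64_BRANCH26 if it fits in a signed
// 28-bit value, roughly +/-128 MiB (see the isInt<> checks below).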
static bool isInRange(uint16_t relType, uint64_t s, uint64_t p, int margin) {
if (config->machine == ARMNT) {
int64_t diff = AbsoluteDifference(s, p + 4) + margin;
switch (relType) {
case IMAGE_REL_ARM_BRANCH20T:
return isInt<21>(diff);
case IMAGE_REL_ARM_BRANCH24T:
case IMAGE_REL_ARM_BLX23T:
return isInt<25>(diff);
default:
return true;
}
} else if (config->machine == ARM64) {
int64_t diff = AbsoluteDifference(s, p) + margin;
switch (relType) {
case IMAGE_REL_ARM64_BRANCH26:
return isInt<28>(diff);
case IMAGE_REL_ARM64_BRANCH19:
return isInt<21>(diff);
case IMAGE_REL_ARM64_BRANCH14:
return isInt<16>(diff);
default:
return true;
}
} else {
llvm_unreachable("Unexpected architecture");
}
}
// Return the last thunk for the given target if it is in range,
// or create a new one.
static std::pair<Defined *, bool>
getThunk(DenseMap<uint64_t, Defined *> &lastThunks, Defined *target, uint64_t p,
uint16_t type, int margin) {
Defined *&lastThunk = lastThunks[target->getRVA()];
if (lastThunk && isInRange(type, lastThunk->getRVA(), p, margin))
return {lastThunk, false};
Chunk *c;
switch (config->machine) {
case ARMNT:
c = make<RangeExtensionThunkARM>(target);
break;
case ARM64:
c = make<RangeExtensionThunkARM64>(target);
break;
default:
llvm_unreachable("Unexpected architecture");
}
Defined *d = make<DefinedSynthetic>("", c);
lastThunk = d;
return {d, true};
}
// This checks all relocations, and for any relocation which isn't in range
// it adds a thunk after the section chunk that contains the relocation.
// If the latest thunk for the specific target is in range, that is used
// instead of creating a new thunk. All range checks are done with the
// specified margin, to make sure that relocations that originally are in
// range, but only barely, also get thunks - in case other added thunks make
// the target go out of range.
//
// After adding thunks, we verify that all relocations are in range (with
// no extra margin requirements). If this fails, we restart (throwing away
// the previously created thunks) and retry with a wider margin.
static bool createThunks(OutputSection *os, int margin) {
bool addressesChanged = false;
DenseMap<uint64_t, Defined *> lastThunks;
DenseMap<std::pair<ObjFile *, Defined *>, uint32_t> thunkSymtabIndices;
size_t thunksSize = 0;
// Recheck Chunks.size() each iteration, since we can insert more
// elements into it.
for (size_t i = 0; i != os->chunks.size(); ++i) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]);
if (!sc)
continue;
size_t thunkInsertionSpot = i + 1;
// Try to get a good enough estimate of where new thunks will be placed.
// Offset this by the size of the new thunks added so far, to make the
// estimate slightly better.
size_t thunkInsertionRVA = sc->getRVA() + sc->getSize() + thunksSize;
ObjFile *file = sc->file;
std::vector<std::pair<uint32_t, uint32_t>> relocReplacements;
ArrayRef<coff_relocation> originalRelocs =
file->getCOFFObj()->getRelocations(sc->header);
for (size_t j = 0, e = originalRelocs.size(); j < e; ++j) {
const coff_relocation &rel = originalRelocs[j];
Symbol *relocTarget = file->getSymbol(rel.SymbolTableIndex);
// The estimate of the source address P should be pretty accurate,
// but we don't know whether the target Symbol address should be
// offset by thunksSize or not (or by some of thunksSize but not all of
// it), giving us some uncertainty once we have added one thunk.
uint64_t p = sc->getRVA() + rel.VirtualAddress + thunksSize;
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
continue;
uint64_t s = sym->getRVA();
if (isInRange(rel.Type, s, p, margin))
continue;
// If the target isn't in range, hook it up to an existing or new
// thunk.
Defined *thunk;
bool wasNew;
std::tie(thunk, wasNew) = getThunk(lastThunks, sym, p, rel.Type, margin);
if (wasNew) {
Chunk *thunkChunk = thunk->getChunk();
thunkChunk->setRVA(
thunkInsertionRVA); // Estimate of where it will be located.
os->chunks.insert(os->chunks.begin() + thunkInsertionSpot, thunkChunk);
thunkInsertionSpot++;
thunksSize += thunkChunk->getSize();
thunkInsertionRVA += thunkChunk->getSize();
addressesChanged = true;
}
// To redirect the relocation, add a symbol to the parent object file's
// symbol table, and replace the relocation symbol table index with the
// new index.
auto insertion = thunkSymtabIndices.insert({{file, thunk}, ~0U});
uint32_t &thunkSymbolIndex = insertion.first->second;
if (insertion.second)
thunkSymbolIndex = file->addRangeThunkSymbol(thunk);
relocReplacements.push_back({j, thunkSymbolIndex});
}
// Get a writable copy of this section's relocations so they can be
// modified. If the relocations point into the object file, allocate new
// memory. Otherwise, this must be previously allocated memory that can be
// modified in place.
ArrayRef<coff_relocation> curRelocs = sc->getRelocs();
MutableArrayRef<coff_relocation> newRelocs;
if (originalRelocs.data() == curRelocs.data()) {
newRelocs = makeMutableArrayRef(
bAlloc.Allocate<coff_relocation>(originalRelocs.size()),
originalRelocs.size());
} else {
newRelocs = makeMutableArrayRef(
const_cast<coff_relocation *>(curRelocs.data()), curRelocs.size());
}
// Copy each relocation, but replace the symbol table indices which need
// thunks.
auto nextReplacement = relocReplacements.begin();
auto endReplacement = relocReplacements.end();
for (size_t i = 0, e = originalRelocs.size(); i != e; ++i) {
newRelocs[i] = originalRelocs[i];
if (nextReplacement != endReplacement && nextReplacement->first == i) {
newRelocs[i].SymbolTableIndex = nextReplacement->second;
++nextReplacement;
}
}
sc->setRelocs(newRelocs);
}
return addressesChanged;
}
// Verify that all relocations are in range, with no extra margin requirements.
static bool verifyRanges(const std::vector<Chunk *> chunks) {
for (Chunk *c : chunks) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c);
if (!sc)
continue;
ArrayRef<coff_relocation> relocs = sc->getRelocs();
for (size_t j = 0, e = relocs.size(); j < e; ++j) {
const coff_relocation &rel = relocs[j];
Symbol *relocTarget = sc->file->getSymbol(rel.SymbolTableIndex);
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
continue;
uint64_t p = sc->getRVA() + rel.VirtualAddress;
uint64_t s = sym->getRVA();
if (!isInRange(rel.Type, s, p, 0))
return false;
}
}
return true;
}
// Assign addresses and add thunks if necessary.
void Writer::finalizeAddresses() {
assignAddresses();
if (config->machine != ARMNT && config->machine != ARM64)
return;
size_t origNumChunks = 0;
for (OutputSection *sec : outputSections) {
sec->origChunks = sec->chunks;
origNumChunks += sec->chunks.size();
}
int pass = 0;
int margin = 1024 * 100;
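// Start with a 100 KiB margin; it is doubled on every failed pass below, and
// we give up after 10 passes.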
while (true) {
// First check whether we need thunks at all, or if the previous pass of
// adding them turned out ok.
bool rangesOk = true;
size_t numChunks = 0;
for (OutputSection *sec : outputSections) {
if (!verifyRanges(sec->chunks)) {
rangesOk = false;
break;
}
numChunks += sec->chunks.size();
}
if (rangesOk) {
if (pass > 0)
log("Added " + Twine(numChunks - origNumChunks) + " thunks with " +
"margin " + Twine(margin) + " in " + Twine(pass) + " passes");
return;
}
if (pass >= 10)
fatal("adding thunks hasn't converged after " + Twine(pass) + " passes");
if (pass > 0) {
// If the previous pass didn't work out, reset everything back to the
// original conditions before retrying with a wider margin. This should
// ideally never happen under real circumstances.
for (OutputSection *sec : outputSections)
sec->chunks = sec->origChunks;
margin *= 2;
}
// Try adding thunks everywhere where it is needed, with a margin
// to avoid things going out of range due to the added thunks.
bool addressesChanged = false;
for (OutputSection *sec : outputSections)
addressesChanged |= createThunks(sec, margin);
// If the verification above thought we needed thunks, we should have
// added some.
assert(addressesChanged);
// Recalculate the layout for the whole image (and verify the ranges at
// the start of the next round).
assignAddresses();
pass++;
}
}
// The main function of the writer.
void Writer::run() {
ScopedTimer t1(codeLayoutTimer);
createImportTables();
createSections();
createMiscChunks();
appendImportThunks();
createExportTable();
mergeSections();
removeUnusedSections();
finalizeAddresses();
removeEmptySections();
assignOutputSectionIndices();
setSectionPermissions();
createSymbolAndStringTable();
if (fileSize > UINT32_MAX)
fatal("image size (" + Twine(fileSize) + ") " +
"exceeds maximum allowable size (" + Twine(UINT32_MAX) + ")");
openFile(config->outputFile);
if (config->is64()) {
writeHeader<pe32plus_header>();
} else {
writeHeader<pe32_header>();
}
writeSections();
sortExceptionTable();
t1.stop();
if (!config->pdbPath.empty() && config->debug) {
assert(buildId);
createPDB(symtab, outputSections, sectionTable, buildId->buildId);
}
writeBuildId();
writeLLDMapFile(outputSections);
writeMapFile(outputSections);
if (errorCount())
return;
ScopedTimer t2(diskCommitTimer);
if (auto e = buffer->commit())
fatal("failed to write the output file: " + toString(std::move(e)));
}
static StringRef getOutputSectionName(StringRef name) {
StringRef s = name.split('$').first;
// Treat a later period as a separator for MinGW, for sections like
// ".ctors.01234".
return s.substr(0, s.find('.', 1));
}
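// For example (illustrative): ".text$foo" maps to ".text", ".idata$2" to
// ".idata", and on MinGW ".ctors.01234" maps to ".ctors".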
// For /order.
static void sortBySectionOrder(std::vector<Chunk *> &chunks) {
auto getPriority = [](const Chunk *c) {
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
return config->order.lookup(sec->sym->getName());
return 0;
};
llvm::stable_sort(chunks, [=](const Chunk *a, const Chunk *b) {
return getPriority(a) < getPriority(b);
});
}
// Change the characteristics of existing PartialSections that belong to the
// section Name to Chars.
void Writer::fixPartialSectionChars(StringRef name, uint32_t chars) {
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef curName = pSec->name;
if (!curName.consume_front(name) ||
(!curName.empty() && !curName.startswith("$")))
continue;
if (pSec->characteristics == chars)
continue;
PartialSection *destSec = createPartialSection(pSec->name, chars);
destSec->chunks.insert(destSec->chunks.end(), pSec->chunks.begin(),
pSec->chunks.end());
pSec->chunks.clear();
}
}
// Sort concrete section chunks from GNU import libraries.
//
// GNU binutils doesn't use short import files, but instead produces import
// libraries that consist of object files, with section chunks for the .idata$*
// sections. These are linked just as regular static libraries. Each import
// library consists of one header object, one object file for every imported
// symbol, and one trailer object. In order for the .idata tables/lists to
// be formed correctly, the section chunks within each .idata$* section need
// to be grouped by library, and sorted alphabetically within each library
// (which makes sure the header comes first and the trailer last).
bool Writer::fixGnuImportChunks() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
// Make sure all .idata$* section chunks are mapped as RDATA in order to
// be sorted into the same sections as our own synthesized .idata chunks.
fixPartialSectionChars(".idata", rdata);
bool hasIdata = false;
// Sort all .idata$* chunks, grouping chunks from the same library,
// with alphabetical ordering of the object files within a library.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
if (!pSec->name.startswith(".idata"))
continue;
if (!pSec->chunks.empty())
hasIdata = true;
llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) {
SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s);
SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t);
if (!sc1 || !sc2) {
// If only sc1 is a SectionChunk, order it first; if sc2 is a SectionChunk
// or both are null, s is not less than t.
return sc1 != nullptr;
}
// Make a string with "libraryname/objectfile" for sorting, achieving
// both grouping by library and sorting of objects within a library,
// at once.
std::string key1 =
(sc1->file->parentName + "/" + sc1->file->getName()).str();
std::string key2 =
(sc2->file->parentName + "/" + sc2->file->getName()).str();
return key1 < key2;
});
}
return hasIdata;
}
// Add generated idata chunks, for imported symbols and DLLs, and a
// terminator in .idata$2.
void Writer::addSyntheticIdata() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
idata.create();
// Add the .idata content in the right section groups, to allow
// chunks from other linked-in object files to be grouped together.
// See Microsoft PE/COFF spec 5.4 for details.
auto add = [&](StringRef n, std::vector<Chunk *> &v) {
PartialSection *pSec = createPartialSection(n, rdata);
pSec->chunks.insert(pSec->chunks.end(), v.begin(), v.end());
};
// The loader assumes a specific order of data.
// Add each type in the correct order.
add(".idata$2", idata.dirs);
add(".idata$4", idata.lookups);
add(".idata$5", idata.addresses);
if (!idata.hints.empty())
add(".idata$6", idata.hints);
add(".idata$7", idata.dllNames);
}
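// Roughly (illustrative), the synthesized .idata content ends up laid out as:
//   .idata$2  import directory entries (plus a null terminator)
//   .idata$4  import lookup table entries
//   .idata$5  import address table entries
//   .idata$6  hint/name entries
//   .idata$7  DLL name strings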
// Locate the first Chunk and size of the import directory list and the
// IAT.
void Writer::locateImportTables() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
if (PartialSection *importDirs = findPartialSection(".idata$2", rdata)) {
if (!importDirs->chunks.empty())
importTableStart = importDirs->chunks.front();
for (Chunk *c : importDirs->chunks)
importTableSize += c->getSize();
}
if (PartialSection *importAddresses = findPartialSection(".idata$5", rdata)) {
if (!importAddresses->chunks.empty())
iatStart = importAddresses->chunks.front();
for (Chunk *c : importAddresses->chunks)
iatSize += c->getSize();
}
}
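// The start chunks and sizes recorded here are later written to the
// IMPORT_TABLE and IAT data directory entries in writeHeader().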
// Return whether a SectionChunk's suffix (the dollar and any trailing
// suffix) should be removed and sorted into the main suffixless
// PartialSection.
static bool shouldStripSectionSuffix(SectionChunk *sc, StringRef name) {
// On MinGW, comdat groups are formed by putting the comdat group name
// after the '$' in the section name. For .eh_frame$<symbol>, that must
// still be sorted before the .eh_frame trailer from crtend.o, thus just
// strip the section name trailer. For other sections, such as
// .tls$$<symbol> (where non-comdat .tls symbols are otherwise stored in
// ".tls$"), they must be strictly sorted after .tls. And for the
// hypothetical case of comdat .CRT$XCU, we definitely need to keep the
// suffix for sorting. Thus, to play it safe, only strip the suffix for
// the standard sections.
if (!config->mingw)
return false;
if (!sc || !sc->isCOMDAT())
return false;
return name.startswith(".text$") || name.startswith(".data$") ||
name.startswith(".rdata$") || name.startswith(".pdata$") ||
name.startswith(".xdata$") || name.startswith(".eh_frame$");
}
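// For example (illustrative), a MinGW comdat chunk named ".text$foo" is binned
// into plain ".text", while ".tls$$foo" and ".CRT$XCU" chunks keep their full
// names so that the '$'-based sorting described above still applies to them.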
// Create output section objects and add them to OutputSections.
void Writer::createSections() {
// First, create the builtin sections.
const uint32_t data = IMAGE_SCN_CNT_INITIALIZED_DATA;
const uint32_t bss = IMAGE_SCN_CNT_UNINITIALIZED_DATA;
const uint32_t code = IMAGE_SCN_CNT_CODE;
const uint32_t discardable = IMAGE_SCN_MEM_DISCARDABLE;
const uint32_t r = IMAGE_SCN_MEM_READ;
const uint32_t w = IMAGE_SCN_MEM_WRITE;
const uint32_t x = IMAGE_SCN_MEM_EXECUTE;
SmallDenseMap<std::pair<StringRef, uint32_t>, OutputSection *> sections;
auto createSection = [&](StringRef name, uint32_t outChars) {
OutputSection *&sec = sections[{name, outChars}];
if (!sec) {
sec = make<OutputSection>(name, outChars);
outputSections.push_back(sec);
}
return sec;
};
// Try to match the section order used by link.exe.
textSec = createSection(".text", code | r | x);
createSection(".bss", bss | r | w);
rdataSec = createSection(".rdata", data | r);
buildidSec = createSection(".buildid", data | r);
dataSec = createSection(".data", data | r | w);
pdataSec = createSection(".pdata", data | r);
idataSec = createSection(".idata", data | r);
edataSec = createSection(".edata", data | r);
didatSec = createSection(".didat", data | r);
rsrcSec = createSection(".rsrc", data | r);
relocSec = createSection(".reloc", data | discardable | r);
ctorsSec = createSection(".ctors", data | r | w);
dtorsSec = createSection(".dtors", data | r | w);
// Then bin chunks by name and output characteristics.
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
if (sc && !sc->live) {
if (config->verbose)
sc->printDiscardedMessage();
continue;
}
StringRef name = c->getSectionName();
if (shouldStripSectionSuffix(sc, name))
name = name.split('$').first;
PartialSection *pSec = createPartialSection(name,
c->getOutputCharacteristics());
pSec->chunks.push_back(c);
}
fixPartialSectionChars(".rsrc", data | r);
fixPartialSectionChars(".edata", data | r);
// Even in non-MinGW cases, we might need to link against GNU import
// libraries.
bool hasIdata = fixGnuImportChunks();
if (!idata.empty())
hasIdata = true;
if (hasIdata)
addSyntheticIdata();
// Process an /order option.
if (!config->order.empty())
for (auto it : partialSections)
sortBySectionOrder(it.second->chunks);
if (hasIdata)
locateImportTables();
// Then create an OutputSection for each section.
// '$' and all following characters in input section names are
// discarded when determining output section. So, .text$foo
// contributes to .text, for example. See PE/COFF spec 3.2.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef name = getOutputSectionName(pSec->name);
uint32_t outChars = pSec->characteristics;
if (name == ".CRT") {
// In link.exe, there is a special case for the I386 target where .CRT
// sections are treated as if they have output characteristics DATA | R if
// their characteristics are DATA | R | W. This implements the same
// special case for all architectures.
outChars = data | r;
log("Processing section " + pSec->name + " -> " + name);
sortCRTSectionChunks(pSec->chunks);
}
OutputSection *sec = createSection(name, outChars);
for (Chunk *c : pSec->chunks)
sec->addChunk(c);
sec->addContributingPartialSection(pSec);
}
// Finally, move some output sections to the end.
auto sectionOrder = [&](const OutputSection *s) {
// Move DISCARDABLE (or non-memory-mapped) sections to the end of file
// because the loader cannot handle holes. Stripping can remove discardable
// sections other than .reloc, which comes first among them (it is created
// early).
if (s->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
return 2;
// .rsrc should come at the end of the non-discardable sections because its
// size may be changed by the Win32 UpdateResources() function, causing
// subsequent sections to move (see https://crbug.com/827082).
if (s == rsrcSec)
return 1;
return 0;
};
llvm::stable_sort(outputSections,
[&](const OutputSection *s, const OutputSection *t) {
return sectionOrder(s) < sectionOrder(t);
});
}
void Writer::createMiscChunks() {
for (MergeChunk *p : MergeChunk::instances) {
if (p) {
p->finalizeContents();
rdataSec->addChunk(p);
}
}
// Create thunks for locally-dllimported symbols.
if (!symtab->localImportChunks.empty()) {
for (Chunk *c : symtab->localImportChunks)
rdataSec->addChunk(c);
}
// Create Debug Information Chunks
OutputSection *debugInfoSec = config->mingw ? buildidSec : rdataSec;
if (config->debug || config->repro || config->cetCompat) {
debugDirectory = make<DebugDirectoryChunk>(debugRecords, config->repro);
debugDirectory->setAlignment(4);
debugInfoSec->addChunk(debugDirectory);
}
if (config->debug) {
// Make a CVDebugRecordChunk even when /DEBUG:CV is not specified. We
// output a PDB no matter what, and this chunk provides the only means of
// allowing a debugger to match a PDB and an executable. So we need it even
// if we're ultimately not going to write CodeView data to the PDB.
buildId = make<CVDebugRecordChunk>();
debugRecords.push_back({COFF::IMAGE_DEBUG_TYPE_CODEVIEW, buildId});
}
if (config->cetCompat) {
ExtendedDllCharacteristicsChunk *extendedDllChars =
make<ExtendedDllCharacteristicsChunk>(
IMAGE_DLL_CHARACTERISTICS_EX_CET_COMPAT);
debugRecords.push_back(
{COFF::IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS, extendedDllChars});
}
if (debugRecords.size() > 0) {
for (std::pair<COFF::DebugType, Chunk *> r : debugRecords)
debugInfoSec->addChunk(r.second);
}
// Create SEH table. x86-only.
if (config->safeSEH)
createSEHTable();
// Create /guard:cf tables if requested.
if (config->guardCF != GuardCFLevel::Off)
createGuardCFTables();
if (config->autoImport)
createRuntimePseudoRelocs();
if (config->mingw)
insertCtorDtorSymbols();
}
// Create .idata section for the DLL-imported symbol table.
// The format of this section is inherently Windows-specific.
// The IdataContents class abstracts away the details for us,
// so we just let it create chunks and add them to the section.
void Writer::createImportTables() {
// Initialize DLLOrder so that import entries are ordered in
// the same order as in the command line. (That affects DLL
// initialization order, and this ordering is MSVC-compatible.)
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
continue;
std::string dll = StringRef(file->dllName).lower();
if (config->dllOrder.count(dll) == 0)
config->dllOrder[dll] = config->dllOrder.size();
if (file->impSym && !isa<DefinedImportData>(file->impSym))
fatal(toString(*file->impSym) + " was replaced");
DefinedImportData *impSym = cast_or_null<DefinedImportData>(file->impSym);
if (config->delayLoads.count(StringRef(file->dllName).lower())) {
if (!file->thunkSym)
fatal("cannot delay-load " + toString(file) +
" due to import of data: " + toString(*impSym));
delayIdata.add(impSym);
} else {
idata.add(impSym);
}
}
}
void Writer::appendImportThunks() {
if (ImportFile::instances.empty())
return;
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
continue;
if (!file->thunkSym)
continue;
if (!isa<DefinedImportThunk>(file->thunkSym))
fatal(toString(*file->thunkSym) + " was replaced");
DefinedImportThunk *thunk = cast<DefinedImportThunk>(file->thunkSym);
if (file->thunkLive)
textSec->addChunk(thunk->getChunk());
}
if (!delayIdata.empty()) {
Defined *helper = cast<Defined>(config->delayLoadHelper);
delayIdata.create(helper);
for (Chunk *c : delayIdata.getChunks())
didatSec->addChunk(c);
for (Chunk *c : delayIdata.getDataChunks())
dataSec->addChunk(c);
for (Chunk *c : delayIdata.getCodeChunks())
textSec->addChunk(c);
}
}
void Writer::createExportTable() {
if (!edataSec->chunks.empty()) {
// Allow using a custom built export table from input object files, instead
// of having the linker synthesize the tables.
if (config->hadExplicitExports)
warn("literal .edata sections override exports");
} else if (!config->exports.empty()) {
for (Chunk *c : edata.chunks)
edataSec->addChunk(c);
}
if (!edataSec->chunks.empty()) {
edataStart = edataSec->chunks.front();
edataEnd = edataSec->chunks.back();
}
}
void Writer::removeUnusedSections() {
// Remove sections that we can be sure won't get content, to avoid
// allocating space for their section headers.
auto isUnused = [this](OutputSection *s) {
if (s == relocSec)
return false; // This section is populated later.
// MergeChunks have zero size at this point, as their size is finalized
// later. Only remove sections that have no Chunks at all.
return s->chunks.empty();
};
outputSections.erase(
std::remove_if(outputSections.begin(), outputSections.end(), isUnused),
outputSections.end());
}
// The Windows loader doesn't seem to like empty sections,
// so we remove any that are present.
void Writer::removeEmptySections() {
auto isEmpty = [](OutputSection *s) { return s->getVirtualSize() == 0; };
outputSections.erase(
std::remove_if(outputSections.begin(), outputSections.end(), isEmpty),
outputSections.end());
}
void Writer::assignOutputSectionIndices() {
// Assign final output section indices, and assign each chunk to its output
// section.
uint32_t idx = 1;
for (OutputSection *os : outputSections) {
os->sectionIndex = idx;
for (Chunk *c : os->chunks)
c->setOutputSectionIdx(idx);
++idx;
}
// Merge chunks are containers of chunks, so assign those an output section
// too.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
for (SectionChunk *sc : mc->sections)
if (sc && sc->live)
sc->setOutputSectionIdx(mc->getOutputSectionIdx());
}
size_t Writer::addEntryToStringTable(StringRef str) {
assert(str.size() > COFF::NameSize);
size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
strtab.insert(strtab.end(), str.begin(), str.end());
strtab.push_back('\0');
return offsetOfEntry;
}
Optional<coff_symbol16> Writer::createSymbol(Defined *def) {
coff_symbol16 sym;
switch (def->kind()) {
case Symbol::DefinedAbsoluteKind:
sym.Value = def->getRVA();
sym.SectionNumber = IMAGE_SYM_ABSOLUTE;
break;
case Symbol::DefinedSyntheticKind:
// Relative symbols are unrepresentable in a COFF symbol table.
return None;
default: {
// Don't write symbols that won't be written to the output to the symbol
// table.
Chunk *c = def->getChunk();
if (!c)
return None;
OutputSection *os = c->getOutputSection();
if (!os)
return None;
sym.Value = def->getRVA() - os->getRVA();
sym.SectionNumber = os->sectionIndex;
break;
}
}
// Symbols that are runtime pseudo relocations don't point to the actual
// symbol data itself (as they are imported), but point to the IAT entry
// instead. Avoid emitting them to the symbol table, as they can confuse
// debuggers.
if (def->isRuntimePseudoReloc)
return None;
StringRef name = def->getName();
if (name.size() > COFF::NameSize) {
sym.Name.Offset.Zeroes = 0;
sym.Name.Offset.Offset = addEntryToStringTable(name);
} else {
memset(sym.Name.ShortName, 0, COFF::NameSize);
memcpy(sym.Name.ShortName, name.data(), name.size());
}
if (auto *d = dyn_cast<DefinedCOFF>(def)) {
COFFSymbolRef ref = d->getCOFFSymbol();
sym.Type = ref.getType();
sym.StorageClass = ref.getStorageClass();
} else {
sym.Type = IMAGE_SYM_TYPE_NULL;
sym.StorageClass = IMAGE_SYM_CLASS_EXTERNAL;
}
sym.NumberOfAuxSymbols = 0;
return sym;
}
void Writer::createSymbolAndStringTable() {
// PE/COFF images are limited to 8 byte section names. Longer names can be
// supported by writing a non-standard string table, but this string table is
// not mapped at runtime and the long names will therefore be inaccessible.
// link.exe always truncates section names to 8 bytes, whereas binutils always
// preserves long section names via the string table. LLD adopts a hybrid
// solution where discardable sections have long names preserved and
// non-discardable sections have their names truncated, to ensure that any
// section which is mapped at runtime also has its name mapped at runtime.
for (OutputSection *sec : outputSections) {
if (sec->name.size() <= COFF::NameSize)
continue;
if ((sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0)
continue;
if (config->warnLongSectionNames) {
warn("section name " + sec->name +
" is longer than 8 characters and will use a non-standard string "
"table");
}
sec->setStringTableOff(addEntryToStringTable(sec->name));
}
if (config->debugDwarf || config->debugSymtab) {
for (ObjFile *file : ObjFile::instances) {
for (Symbol *b : file->getSymbols()) {
auto *d = dyn_cast_or_null<Defined>(b);
if (!d || d->writtenToSymtab)
continue;
d->writtenToSymtab = true;
if (Optional<coff_symbol16> sym = createSymbol(d))
outputSymtab.push_back(*sym);
}
}
}
if (outputSymtab.empty() && strtab.empty())
return;
// We position the symbol table to be adjacent to the end of the last section.
uint64_t fileOff = fileSize;
pointerToSymbolTable = fileOff;
fileOff += outputSymtab.size() * sizeof(coff_symbol16);
fileOff += 4 + strtab.size();
fileSize = alignTo(fileOff, config->fileAlign);
}
void Writer::mergeSections() {
if (!pdataSec->chunks.empty()) {
firstPdata = pdataSec->chunks.front();
lastPdata = pdataSec->chunks.back();
}
for (auto &p : config->merge) {
StringRef toName = p.second;
if (p.first == toName)
continue;
StringSet<> names;
while (1) {
if (!names.insert(toName).second)
fatal("/merge: cycle found for section '" + p.first + "'");
auto i = config->merge.find(toName);
if (i == config->merge.end())
break;
toName = i->second;
}
OutputSection *from = findSection(p.first);
OutputSection *to = findSection(toName);
if (!from)
continue;
if (!to) {
from->name = toName;
continue;
}
to->merge(from);
}
}
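// For example (illustrative), with "/merge:.a=.b /merge:.b=.c" the chain is
// followed so both .a and .b end up merged into .c, whereas
// "/merge:.a=.b /merge:.b=.a" is rejected as a cycle.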
// Visits all sections to assign incremental, non-overlapping RVAs and
// file offsets.
void Writer::assignAddresses() {
sizeOfHeaders = dosStubSize + sizeof(PEMagic) + sizeof(coff_file_header) +
sizeof(data_directory) * numberOfDataDirectory +
sizeof(coff_section) * outputSections.size();
sizeOfHeaders +=
config->is64() ? sizeof(pe32plus_header) : sizeof(pe32_header);
sizeOfHeaders = alignTo(sizeOfHeaders, config->fileAlign);
fileSize = sizeOfHeaders;
// The first page is kept unmapped.
uint64_t rva = alignTo(sizeOfHeaders, config->align);
for (OutputSection *sec : outputSections) {
if (sec == relocSec)
addBaserels();
uint64_t rawSize = 0, virtualSize = 0;
sec->header.VirtualAddress = rva;
// If /FUNCTIONPADMIN is used, functions are padded in order to create a
// hotpatchable image.
const bool isCodeSection =
(sec->header.Characteristics & IMAGE_SCN_CNT_CODE) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_READ) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_EXECUTE);
uint32_t padding = isCodeSection ? config->functionPadMin : 0;
for (Chunk *c : sec->chunks) {
if (padding && c->isHotPatchable())
virtualSize += padding;
virtualSize = alignTo(virtualSize, c->getAlignment());
c->setRVA(rva + virtualSize);
virtualSize += c->getSize();
if (c->hasData)
rawSize = alignTo(virtualSize, config->fileAlign);
}
if (virtualSize > UINT32_MAX)
error("section larger than 4 GiB: " + sec->name);
sec->header.VirtualSize = virtualSize;
sec->header.SizeOfRawData = rawSize;
if (rawSize != 0)
sec->header.PointerToRawData = fileSize;
rva += alignTo(virtualSize, config->align);
fileSize += alignTo(rawSize, config->fileAlign);
}
sizeOfImage = alignTo(rva, config->align);
// Assign addresses to sections in MergeChunks.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
mc->assignSubsectionRVAs();
}
template <typename PEHeaderTy> void Writer::writeHeader() {
// Write DOS header. For backwards compatibility, the first part of a PE/COFF
// executable consists of an MS-DOS MZ executable. If the executable is run
// under DOS, that program gets run (usually to just print an error message).
// When run under Windows, the loader looks at AddressOfNewExeHeader and uses
// the PE header instead.
uint8_t *buf = buffer->getBufferStart();
auto *dos = reinterpret_cast<dos_header *>(buf);
buf += sizeof(dos_header);
dos->Magic[0] = 'M';
dos->Magic[1] = 'Z';
dos->UsedBytesInTheLastPage = dosStubSize % 512;
dos->FileSizeInPages = divideCeil(dosStubSize, 512);
dos->HeaderSizeInParagraphs = sizeof(dos_header) / 16;
dos->AddressOfRelocationTable = sizeof(dos_header);
dos->AddressOfNewExeHeader = dosStubSize;
// Write DOS program.
memcpy(buf, dosProgram, sizeof(dosProgram));
buf += sizeof(dosProgram);
// Write PE magic
memcpy(buf, PEMagic, sizeof(PEMagic));
buf += sizeof(PEMagic);
// Write COFF header
auto *coff = reinterpret_cast<coff_file_header *>(buf);
buf += sizeof(*coff);
coff->Machine = config->machine;
coff->NumberOfSections = outputSections.size();
coff->Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE;
if (config->largeAddressAware)
coff->Characteristics |= IMAGE_FILE_LARGE_ADDRESS_AWARE;
if (!config->is64())
coff->Characteristics |= IMAGE_FILE_32BIT_MACHINE;
if (config->dll)
coff->Characteristics |= IMAGE_FILE_DLL;
if (config->driverUponly)
coff->Characteristics |= IMAGE_FILE_UP_SYSTEM_ONLY;
if (!config->relocatable)
coff->Characteristics |= IMAGE_FILE_RELOCS_STRIPPED;
if (config->swaprunCD)
coff->Characteristics |= IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP;
if (config->swaprunNet)
coff->Characteristics |= IMAGE_FILE_NET_RUN_FROM_SWAP;
coff->SizeOfOptionalHeader =
sizeof(PEHeaderTy) + sizeof(data_directory) * numberOfDataDirectory;
// Write PE header
auto *pe = reinterpret_cast<PEHeaderTy *>(buf);
buf += sizeof(*pe);
pe->Magic = config->is64() ? PE32Header::PE32_PLUS : PE32Header::PE32;
// If {Major,Minor}LinkerVersion is left at 0.0, then for some
// reason signing the resulting PE file with Authenticode produces a
// signature that fails to validate on Windows 7 (but is OK on 10).
// Set it to 14.0, which is what VS2015 outputs, and which avoids
// that problem.
pe->MajorLinkerVersion = 14;
pe->MinorLinkerVersion = 0;
pe->ImageBase = config->imageBase;
pe->SectionAlignment = config->align;
pe->FileAlignment = config->fileAlign;
pe->MajorImageVersion = config->majorImageVersion;
pe->MinorImageVersion = config->minorImageVersion;
pe->MajorOperatingSystemVersion = config->majorOSVersion;
pe->MinorOperatingSystemVersion = config->minorOSVersion;
pe->MajorSubsystemVersion = config->majorOSVersion;
pe->MinorSubsystemVersion = config->minorOSVersion;
pe->Subsystem = config->subsystem;
pe->SizeOfImage = sizeOfImage;
pe->SizeOfHeaders = sizeOfHeaders;
if (!config->noEntry) {
Defined *entry = cast<Defined>(config->entry);
pe->AddressOfEntryPoint = entry->getRVA();
// Pointer to thumb code must have the LSB set, so adjust it.
if (config->machine == ARMNT)
pe->AddressOfEntryPoint |= 1;
}
pe->SizeOfStackReserve = config->stackReserve;
pe->SizeOfStackCommit = config->stackCommit;
pe->SizeOfHeapReserve = config->heapReserve;
pe->SizeOfHeapCommit = config->heapCommit;
if (config->appContainer)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_APPCONTAINER;
if (config->driverWdm)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER;
if (config->dynamicBase)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE;
if (config->highEntropyVA)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA;
if (!config->allowBind)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_BIND;
if (config->nxCompat)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NX_COMPAT;
if (!config->allowIsolation)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION;
if (config->guardCF != GuardCFLevel::Off)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF;
if (config->integrityCheck)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY;
- if (setNoSEHCharacteristic)
+ if (setNoSEHCharacteristic || config->noSEH)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH;
if (config->terminalServerAware)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE;
pe->NumberOfRvaAndSize = numberOfDataDirectory;
if (textSec->getVirtualSize()) {
pe->BaseOfCode = textSec->getRVA();
pe->SizeOfCode = textSec->getRawSize();
}
pe->SizeOfInitializedData = getSizeOfInitializedData();
// Write data directory
auto *dir = reinterpret_cast<data_directory *>(buf);
buf += sizeof(*dir) * numberOfDataDirectory;
if (edataStart) {
dir[EXPORT_TABLE].RelativeVirtualAddress = edataStart->getRVA();
dir[EXPORT_TABLE].Size =
edataEnd->getRVA() + edataEnd->getSize() - edataStart->getRVA();
}
if (importTableStart) {
dir[IMPORT_TABLE].RelativeVirtualAddress = importTableStart->getRVA();
dir[IMPORT_TABLE].Size = importTableSize;
}
if (iatStart) {
dir[IAT].RelativeVirtualAddress = iatStart->getRVA();
dir[IAT].Size = iatSize;
}
if (rsrcSec->getVirtualSize()) {
dir[RESOURCE_TABLE].RelativeVirtualAddress = rsrcSec->getRVA();
dir[RESOURCE_TABLE].Size = rsrcSec->getVirtualSize();
}
if (firstPdata) {
dir[EXCEPTION_TABLE].RelativeVirtualAddress = firstPdata->getRVA();
dir[EXCEPTION_TABLE].Size =
lastPdata->getRVA() + lastPdata->getSize() - firstPdata->getRVA();
}
if (relocSec->getVirtualSize()) {
dir[BASE_RELOCATION_TABLE].RelativeVirtualAddress = relocSec->getRVA();
dir[BASE_RELOCATION_TABLE].Size = relocSec->getVirtualSize();
}
if (Symbol *sym = symtab->findUnderscore("_tls_used")) {
if (Defined *b = dyn_cast<Defined>(sym)) {
dir[TLS_TABLE].RelativeVirtualAddress = b->getRVA();
dir[TLS_TABLE].Size = config->is64()
? sizeof(object::coff_tls_directory64)
: sizeof(object::coff_tls_directory32);
}
}
if (debugDirectory) {
dir[DEBUG_DIRECTORY].RelativeVirtualAddress = debugDirectory->getRVA();
dir[DEBUG_DIRECTORY].Size = debugDirectory->getSize();
}
if (Symbol *sym = symtab->findUnderscore("_load_config_used")) {
if (auto *b = dyn_cast<DefinedRegular>(sym)) {
SectionChunk *sc = b->getChunk();
assert(b->getRVA() >= sc->getRVA());
uint64_t offsetInChunk = b->getRVA() - sc->getRVA();
if (!sc->hasData || offsetInChunk + 4 > sc->getSize())
fatal("_load_config_used is malformed");
ArrayRef<uint8_t> secContents = sc->getContents();
uint32_t loadConfigSize =
*reinterpret_cast<const ulittle32_t *>(&secContents[offsetInChunk]);
if (offsetInChunk + loadConfigSize > sc->getSize())
fatal("_load_config_used is too large");
dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress = b->getRVA();
dir[LOAD_CONFIG_TABLE].Size = loadConfigSize;
}
}
if (!delayIdata.empty()) {
dir[DELAY_IMPORT_DESCRIPTOR].RelativeVirtualAddress =
delayIdata.getDirRVA();
dir[DELAY_IMPORT_DESCRIPTOR].Size = delayIdata.getDirSize();
}
// Write section table
for (OutputSection *sec : outputSections) {
sec->writeHeaderTo(buf);
buf += sizeof(coff_section);
}
sectionTable = ArrayRef<uint8_t>(
buf - outputSections.size() * sizeof(coff_section), buf);
if (outputSymtab.empty() && strtab.empty())
return;
coff->PointerToSymbolTable = pointerToSymbolTable;
uint32_t numberOfSymbols = outputSymtab.size();
coff->NumberOfSymbols = numberOfSymbols;
auto *symbolTable = reinterpret_cast<coff_symbol16 *>(
buffer->getBufferStart() + coff->PointerToSymbolTable);
for (size_t i = 0; i != numberOfSymbols; ++i)
symbolTable[i] = outputSymtab[i];
// Create the string table; it follows immediately after the symbol table.
// The first 4 bytes hold the total length, including the length field itself.
buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
write32le(buf, strtab.size() + 4);
if (!strtab.empty())
memcpy(buf + 4, strtab.data(), strtab.size());
}
void Writer::openFile(StringRef path) {
buffer = CHECK(
FileOutputBuffer::create(path, fileSize, FileOutputBuffer::F_executable),
"failed to open " + path);
}
void Writer::createSEHTable() {
SymbolRVASet handlers;
for (ObjFile *file : ObjFile::instances) {
if (!file->hasSafeSEH())
error("/safeseh: " + file->getName() + " is not compatible with SEH");
markSymbolsForRVATable(file, file->getSXDataChunks(), handlers);
}
// Set the "no SEH" characteristic if there really were no handlers, or if
// there is no load config object to point to the table of handlers.
setNoSEHCharacteristic =
handlers.empty() || !symtab->findUnderscore("_load_config_used");
maybeAddRVATable(std::move(handlers), "__safe_se_handler_table",
"__safe_se_handler_count");
}
// Add a symbol to an RVA set. Two symbols may have the same RVA, but an RVA set
// cannot contain duplicates. Therefore, the set is uniqued by Chunk and the
// symbol's offset into that Chunk.
static void addSymbolToRVASet(SymbolRVASet &rvaSet, Defined *s) {
Chunk *c = s->getChunk();
if (auto *sc = dyn_cast<SectionChunk>(c))
c = sc->repl; // Look through ICF replacement.
uint32_t off = s->getRVA() - (c ? c->getRVA() : 0);
rvaSet.insert({c, off});
}
// Given a symbol, add it to the GFIDs table if it is a live, defined, function
// symbol in an executable section.
static void maybeAddAddressTakenFunction(SymbolRVASet &addressTakenSyms,
Symbol *s) {
if (!s)
return;
switch (s->kind()) {
case Symbol::DefinedLocalImportKind:
case Symbol::DefinedImportDataKind:
// Defines an __imp_ pointer, so it is data, so it is ignored.
break;
case Symbol::DefinedCommonKind:
// Common is always data, so it is ignored.
break;
case Symbol::DefinedAbsoluteKind:
case Symbol::DefinedSyntheticKind:
// Absolute symbols are never code; synthetic symbols generally aren't,
// and usually it isn't determinable anyway.
break;
case Symbol::LazyArchiveKind:
case Symbol::LazyObjectKind:
case Symbol::UndefinedKind:
// Undefined symbols resolve to zero, so they don't have an RVA. Lazy
// symbols shouldn't have relocations.
break;
case Symbol::DefinedImportThunkKind:
// Thunks are always code, include them.
addSymbolToRVASet(addressTakenSyms, cast<Defined>(s));
break;
case Symbol::DefinedRegularKind: {
// This is a regular, defined, symbol from a COFF file. Mark the symbol as
// address taken if the symbol type is function and it's in an executable
// section.
auto *d = cast<DefinedRegular>(s);
if (d->getCOFFSymbol().getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
SectionChunk *sc = dyn_cast<SectionChunk>(d->getChunk());
if (sc && sc->live &&
sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE)
addSymbolToRVASet(addressTakenSyms, d);
}
break;
}
}
}
// Visit all relocations from all section contributions of this object file and
// mark the relocation target as address-taken.
static void markSymbolsWithRelocations(ObjFile *file,
SymbolRVASet &usedSymbols) {
for (Chunk *c : file->getChunks()) {
// We only care about live section chunks. Common chunks and other chunks
// don't generally contain relocations.
SectionChunk *sc = dyn_cast<SectionChunk>(c);
if (!sc || !sc->live)
continue;
for (const coff_relocation &reloc : sc->getRelocs()) {
if (config->machine == I386 && reloc.Type == COFF::IMAGE_REL_I386_REL32)
// Ignore relative relocations on x86. On x86_64 they can't be ignored
// since they're also used to compute absolute addresses.
continue;
Symbol *ref = sc->file->getSymbol(reloc.SymbolTableIndex);
maybeAddAddressTakenFunction(usedSymbols, ref);
}
}
}
// Create the guard function id table. This is a table of RVAs of all
// address-taken functions. It is sorted and uniqued, just like the safe SEH
// table.
void Writer::createGuardCFTables() {
SymbolRVASet addressTakenSyms;
SymbolRVASet longJmpTargets;
for (ObjFile *file : ObjFile::instances) {
// If the object was compiled with /guard:cf, the address taken symbols
// are in .gfids$y sections, and the longjmp targets are in .gljmp$y
// sections. If the object was not compiled with /guard:cf, we assume there
// were no setjmp targets, and that all code symbols with relocations are
// possibly address-taken.
if (file->hasGuardCF()) {
markSymbolsForRVATable(file, file->getGuardFidChunks(), addressTakenSyms);
markSymbolsForRVATable(file, file->getGuardLJmpChunks(), longJmpTargets);
} else {
markSymbolsWithRelocations(file, addressTakenSyms);
}
}
// Mark the image entry as address-taken.
if (config->entry)
maybeAddAddressTakenFunction(addressTakenSyms, config->entry);
// Mark exported symbols in executable sections as address-taken.
for (Export &e : config->exports)
maybeAddAddressTakenFunction(addressTakenSyms, e.sym);
// Ensure sections referenced in the gfid table are 16-byte aligned.
for (const ChunkAndOffset &c : addressTakenSyms)
if (c.inputChunk->getAlignment() < 16)
c.inputChunk->setAlignment(16);
maybeAddRVATable(std::move(addressTakenSyms), "__guard_fids_table",
"__guard_fids_count");
// Add the longjmp target table unless the user told us not to.
if (config->guardCF == GuardCFLevel::Full)
maybeAddRVATable(std::move(longJmpTargets), "__guard_longjmp_table",
"__guard_longjmp_count");
// Set __guard_flags, which will be used in the load config to indicate that
// /guard:cf was enabled.
uint32_t guardFlags = uint32_t(coff_guard_flags::CFInstrumented) |
uint32_t(coff_guard_flags::HasFidTable);
if (config->guardCF == GuardCFLevel::Full)
guardFlags |= uint32_t(coff_guard_flags::HasLongJmpTable);
Symbol *flagSym = symtab->findUnderscore("__guard_flags");
cast<DefinedAbsolute>(flagSym)->setVA(guardFlags);
}
// Take a list of input sections containing symbol table indices and add those
// symbols to an RVA table. The challenge is that symbol RVAs are not known and
// depend on the table size, so we can't directly build a set of integers.
void Writer::markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols) {
for (SectionChunk *c : symIdxChunks) {
// Skip sections discarded by linker GC. This comes up when a .gfids section
// is associated with something like a vtable and the vtable is discarded.
// In this case, the associated gfids section is discarded, and we don't
// mark the virtual member functions as address-taken by the vtable.
if (!c->live)
continue;
// Validate that the contents look like symbol table indices.
ArrayRef<uint8_t> data = c->getContents();
if (data.size() % 4 != 0) {
warn("ignoring " + c->getSectionName() +
" symbol table index section in object " + toString(file));
continue;
}
// Read each symbol table index and check if that symbol was included in the
// final link. If so, add it to the table symbol set.
ArrayRef<ulittle32_t> symIndices(
reinterpret_cast<const ulittle32_t *>(data.data()), data.size() / 4);
ArrayRef<Symbol *> objSymbols = file->getSymbols();
for (uint32_t symIndex : symIndices) {
if (symIndex >= objSymbols.size()) {
warn("ignoring invalid symbol table index in section " +
c->getSectionName() + " in object " + toString(file));
continue;
}
if (Symbol *s = objSymbols[symIndex]) {
if (s->isLive())
addSymbolToRVASet(tableSymbols, cast<Defined>(s));
}
}
}
}
// Replace the absolute table symbol with a synthetic symbol pointing to
// tableChunk so that we can emit base relocations for it and resolve section
// relative relocations.
void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym) {
if (tableSymbols.empty())
return;
RVATableChunk *tableChunk = make<RVATableChunk>(std::move(tableSymbols));
rdataSec->addChunk(tableChunk);
Symbol *t = symtab->findUnderscore(tableSym);
Symbol *c = symtab->findUnderscore(countSym);
replaceSymbol<DefinedSynthetic>(t, t->getName(), tableChunk);
cast<DefinedAbsolute>(c)->setVA(tableChunk->getSize() / 4);
}
// MinGW specific. Gather all relocations against symbols that turn out to be
// imported from a DLL even though the code didn't expect them to be, produce
// the table that the runtime uses for fixing them up, and provide the
// synthetic symbols that the runtime uses for finding the table.
void Writer::createRuntimePseudoRelocs() {
std::vector<RuntimePseudoReloc> rels;
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
if (!sc || !sc->live)
continue;
sc->getRuntimePseudoRelocs(rels);
}
if (!config->pseudoRelocs) {
// Not writing any pseudo relocs; if some were needed, error out and
// indicate what required them.
for (const RuntimePseudoReloc &rpr : rels)
error("automatic dllimport of " + rpr.sym->getName() + " in " +
toString(rpr.target->file) + " requires pseudo relocations");
return;
}
if (!rels.empty())
log("Writing " + Twine(rels.size()) + " runtime pseudo relocations");
PseudoRelocTableChunk *table = make<PseudoRelocTableChunk>(rels);
rdataSec->addChunk(table);
EmptyChunk *endOfList = make<EmptyChunk>();
rdataSec->addChunk(endOfList);
Symbol *headSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST__");
Symbol *endSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST_END__");
replaceSymbol<DefinedSynthetic>(headSym, headSym->getName(), table);
replaceSymbol<DefinedSynthetic>(endSym, endSym->getName(), endOfList);
}
// MinGW specific.
// The MinGW .ctors and .dtors lists have sentinels at each end;
// a (uintptr_t)-1 at the start and a (uintptr_t)0 at the end.
// There are symbols pointing to the start sentinels, __CTOR_LIST__
// and __DTOR_LIST__ respectively.
void Writer::insertCtorDtorSymbols() {
AbsolutePointerChunk *ctorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *ctorListEnd = make<AbsolutePointerChunk>(0);
AbsolutePointerChunk *dtorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *dtorListEnd = make<AbsolutePointerChunk>(0);
ctorsSec->insertChunkAtStart(ctorListHead);
ctorsSec->addChunk(ctorListEnd);
dtorsSec->insertChunkAtStart(dtorListHead);
dtorsSec->addChunk(dtorListEnd);
Symbol *ctorListSym = symtab->findUnderscore("__CTOR_LIST__");
Symbol *dtorListSym = symtab->findUnderscore("__DTOR_LIST__");
replaceSymbol<DefinedSynthetic>(ctorListSym, ctorListSym->getName(),
ctorListHead);
replaceSymbol<DefinedSynthetic>(dtorListSym, dtorListSym->getName(),
dtorListHead);
}
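// The resulting list layout (illustrative) is:
//   __CTOR_LIST__ -> { (uintptr_t)-1, ctor0, ctor1, ..., (uintptr_t)0 }
// and likewise for __DTOR_LIST__.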
// Handles /section options to allow users to overwrite
// section attributes.
void Writer::setSectionPermissions() {
for (auto &p : config->section) {
StringRef name = p.first;
uint32_t perm = p.second;
for (OutputSection *sec : outputSections)
if (sec->name == name)
sec->setPermissions(perm);
}
}
// Write section contents to a mmap'ed file.
void Writer::writeSections() {
// Record the number of sections to apply section index relocations
// against absolute symbols. See applySecIdx in Chunks.cpp.
DefinedAbsolute::numOutputSections = outputSections.size();
uint8_t *buf = buffer->getBufferStart();
for (OutputSection *sec : outputSections) {
uint8_t *secBuf = buf + sec->getFileOff();
// Fill gaps between functions in .text with INT3 instructions
// instead of leaving as NUL bytes (which can be interpreted as
// ADD instructions).
if (sec->header.Characteristics & IMAGE_SCN_CNT_CODE)
memset(secBuf, 0xCC, sec->getRawSize());
parallelForEach(sec->chunks, [&](Chunk *c) {
c->writeTo(secBuf + c->getRVA() - sec->getRVA());
});
}
}
void Writer::writeBuildId() {
// There are two important parts to the build ID.
// 1) If building with debug info, the COFF debug directory contains a
// timestamp as well as a Guid and Age of the PDB.
// 2) In all cases, the PE COFF file header also contains a timestamp.
// For reproducibility, instead of a timestamp we want to use a hash of the
// PE contents.
if (config->debug) {
assert(buildId && "BuildId is not set!");
// BuildId->BuildId was filled in when the PDB was written.
}
// At this point the only fields in the COFF file which remain unset are the
// "timestamp" in the COFF file header, and the ones in the coff debug
// directory. Now we can hash the file and write that hash to the various
// timestamp fields in the file.
StringRef outputFileData(
reinterpret_cast<const char *>(buffer->getBufferStart()),
buffer->getBufferSize());
uint32_t timestamp = config->timestamp;
uint64_t hash = 0;
bool generateSyntheticBuildId =
config->mingw && config->debug && config->pdbPath.empty();
if (config->repro || generateSyntheticBuildId)
hash = xxHash64(outputFileData);
if (config->repro)
timestamp = static_cast<uint32_t>(hash);
if (generateSyntheticBuildId) {
// For MinGW builds without a PDB file, we still generate a build id
// to allow associating a crash dump to the executable.
buildId->buildId->PDB70.CVSignature = OMF::Signature::PDB70;
buildId->buildId->PDB70.Age = 1;
memcpy(buildId->buildId->PDB70.Signature, &hash, 8);
// xxhash only gives us 8 bytes, so put some fixed data in the other half.
memcpy(&buildId->buildId->PDB70.Signature[8], "LLD PDB.", 8);
}
if (debugDirectory)
debugDirectory->setTimeDateStamp(timestamp);
uint8_t *buf = buffer->getBufferStart();
buf += dosStubSize + sizeof(PEMagic);
object::coff_file_header *coffHeader =
reinterpret_cast<coff_file_header *>(buf);
coffHeader->TimeDateStamp = timestamp;
}
// Sort .pdata section contents according to PE/COFF spec 5.5.
void Writer::sortExceptionTable() {
if (!firstPdata)
return;
// We assume .pdata contains function table entries only.
auto bufAddr = [&](Chunk *c) {
OutputSection *os = c->getOutputSection();
return buffer->getBufferStart() + os->getFileOff() + c->getRVA() -
os->getRVA();
};
uint8_t *begin = bufAddr(firstPdata);
uint8_t *end = bufAddr(lastPdata) + lastPdata->getSize();
if (config->machine == AMD64) {
struct Entry { ulittle32_t begin, end, unwind; };
if ((end - begin) % sizeof(Entry) != 0) {
fatal("unexpected .pdata size: " + Twine(end - begin) +
" is not a multiple of " + Twine(sizeof(Entry)));
}
parallelSort(
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
return;
}
if (config->machine == ARMNT || config->machine == ARM64) {
struct Entry { ulittle32_t begin, unwind; };
if ((end - begin) % sizeof(Entry) != 0) {
fatal("unexpected .pdata size: " + Twine(end - begin) +
" is not a multiple of " + Twine(sizeof(Entry)));
}
parallelSort(
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
return;
}
lld::errs() << "warning: don't know how to handle .pdata.\n";
}
// The CRT section contains, among other things, the array of function
// pointers that initialize every global variable that is not trivially
// constructed. The CRT calls them one after the other prior to invoking
// main().
//
// As per C++ spec, 3.6.2/2.3,
// "Variables with ordered initialization defined within a single
// translation unit shall be initialized in the order of their definitions
// in the translation unit"
//
// It is therefore critical to sort the chunks containing the function
// pointers in the order that they are listed in the object file (top to
// bottom), otherwise global objects might not be initialized in the
// correct order.
void Writer::sortCRTSectionChunks(std::vector<Chunk *> &chunks) {
auto sectionChunkOrder = [](const Chunk *a, const Chunk *b) {
auto sa = dyn_cast<SectionChunk>(a);
auto sb = dyn_cast<SectionChunk>(b);
assert(sa && sb && "Non-section chunks in CRT section!");
StringRef sAObj = sa->file->mb.getBufferIdentifier();
StringRef sBObj = sb->file->mb.getBufferIdentifier();
return sAObj == sBObj && sa->getSectionNumber() < sb->getSectionNumber();
};
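// Note: the comparator only imposes an order between chunks from the same
// object file; the stable sort is relied on to keep chunks from different
// objects in their existing relative order.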
llvm::stable_sort(chunks, sectionChunkOrder);
if (config->verbose) {
for (auto &c : chunks) {
auto sc = dyn_cast<SectionChunk>(c);
log(" " + sc->file->mb.getBufferIdentifier().str() +
", SectionID: " + Twine(sc->getSectionNumber()));
}
}
}
OutputSection *Writer::findSection(StringRef name) {
for (OutputSection *sec : outputSections)
if (sec->name == name)
return sec;
return nullptr;
}
uint32_t Writer::getSizeOfInitializedData() {
uint32_t res = 0;
for (OutputSection *s : outputSections)
if (s->header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA)
res += s->getRawSize();
return res;
}
// Add base relocations to .reloc section.
void Writer::addBaserels() {
if (!config->relocatable)
return;
relocSec->chunks.clear();
std::vector<Baserel> v;
for (OutputSection *sec : outputSections) {
if (sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
continue;
// Collect all locations for base relocations.
for (Chunk *c : sec->chunks)
c->getBaserels(&v);
// Add the addresses to .reloc section.
if (!v.empty())
addBaserelBlocks(v);
v.clear();
}
}
// Add addresses to .reloc section. Note that addresses are grouped by page.
void Writer::addBaserelBlocks(std::vector<Baserel> &v) {
const uint32_t mask = ~uint32_t(pageSize - 1);
uint32_t page = v[0].rva & mask;
size_t i = 0, j = 1;
for (size_t e = v.size(); j < e; ++j) {
uint32_t p = v[j].rva & mask;
if (p == page)
continue;
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
i = j;
page = p;
}
if (i == j)
return;
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
}
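// Illustrative example (assuming a 4 KiB page size): RVAs 0x1004, 0x1ff8 and
// 0x2010 produce two BaserelChunks, one for page 0x1000 with two entries and
// one for page 0x2000 with one entry.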
PartialSection *Writer::createPartialSection(StringRef name,
uint32_t outChars) {
PartialSection *&pSec = partialSections[{name, outChars}];
if (pSec)
return pSec;
pSec = make<PartialSection>(name, outChars);
return pSec;
}
PartialSection *Writer::findPartialSection(StringRef name, uint32_t outChars) {
auto it = partialSections.find({name, outChars});
if (it != partialSections.end())
return it->second;
return nullptr;
}
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index c3a11b199675..d6580430daf7 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -1,448 +1,454 @@
//===-- llvm/CodeGen/TargetFrameLowering.h ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to describe the layout of a stack frame on the target machine.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_TARGETFRAMELOWERING_H
#define LLVM_CODEGEN_TARGETFRAMELOWERING_H
#include "llvm/CodeGen/MachineBasicBlock.h"
#include <vector>
namespace llvm {
class BitVector;
class CalleeSavedInfo;
class MachineFunction;
class RegScavenger;
namespace TargetStackID {
enum Value {
Default = 0,
SGPRSpill = 1,
SVEVector = 2,
NoAlloc = 255
};
}
/// Information about stack frame layout on the target. It holds the direction
/// of stack growth, the known stack alignment on entry to each function, and
/// the offset to the locals area.
///
/// The offset to the local area is the offset from the stack pointer on
/// function entry to the first location where function data (local variables,
/// spill locations) can be stored.
class TargetFrameLowering {
public:
enum StackDirection {
StackGrowsUp, // Adding to the stack increases the stack address
StackGrowsDown // Adding to the stack decreases the stack address
};
// Maps a callee saved register to a stack slot with a fixed offset.
struct SpillSlot {
unsigned Reg;
int Offset; // Offset relative to stack pointer on function entry.
};
struct DwarfFrameBase {
// The frame base may be either a register (the default), the CFA,
// or a WebAssembly-specific location description.
enum FrameBaseKind { Register, CFA, WasmFrameBase } Kind;
struct WasmFrameBase {
unsigned Kind; // Wasm local, global, or value stack
unsigned Index;
};
union {
unsigned Reg;
struct WasmFrameBase WasmLoc;
} Location;
};
private:
StackDirection StackDir;
Align StackAlignment;
Align TransientStackAlignment;
int LocalAreaOffset;
bool StackRealignable;
public:
TargetFrameLowering(StackDirection D, Align StackAl, int LAO,
Align TransAl = Align(1), bool StackReal = true)
: StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl),
LocalAreaOffset(LAO), StackRealignable(StackReal) {}
virtual ~TargetFrameLowering();
// These methods return information that describes the abstract stack layout
// of the target machine.
/// getStackGrowthDirection - Return the direction the stack grows
///
StackDirection getStackGrowthDirection() const { return StackDir; }
/// getStackAlignment - This method returns the number of bytes to which the
/// stack pointer must be aligned on entry to a function. Typically, this
/// is the largest alignment for any data object in the target.
///
unsigned getStackAlignment() const { return StackAlignment.value(); }
/// getStackAlignment - This method returns the number of bytes to which the
/// stack pointer must be aligned on entry to a function. Typically, this
/// is the largest alignment for any data object in the target.
///
Align getStackAlign() const { return StackAlignment; }
/// alignSPAdjust - This method aligns the stack adjustment to the correct
/// alignment.
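/// For example (illustrative), with a 16-byte StackAlignment an SPAdj of 20
/// becomes 32 and an SPAdj of -20 becomes -32.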
///
int alignSPAdjust(int SPAdj) const {
if (SPAdj < 0) {
SPAdj = -alignTo(-SPAdj, StackAlignment);
} else {
SPAdj = alignTo(SPAdj, StackAlignment);
}
return SPAdj;
}
/// getTransientStackAlignment - This method returns the number of bytes to
/// which the stack pointer must be aligned at all times, even between
/// calls.
///
LLVM_ATTRIBUTE_DEPRECATED(unsigned getTransientStackAlignment() const,
"Use getTransientStackAlign instead") {
return TransientStackAlignment.value();
}
/// getTransientStackAlignment - This method returns the number of bytes to
/// which the stack pointer must be aligned at all times, even between
/// calls.
///
Align getTransientStackAlign() const { return TransientStackAlignment; }
/// isStackRealignable - This method returns whether the stack can be
/// realigned.
bool isStackRealignable() const {
return StackRealignable;
}
/// Return the skew that has to be applied to stack alignment under
/// certain conditions (e.g. stack was adjusted before function \p MF
/// was called).
virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const;
+ /// This method returns whether or not it is safe for an object with the
+ /// given stack id to be bundled into the local area.
+ virtual bool isStackIdSafeForLocalArea(unsigned StackId) const {
+ return true;
+ }
+
/// getOffsetOfLocalArea - This method returns the offset of the local area
/// from the stack pointer on entrance to a function.
///
int getOffsetOfLocalArea() const { return LocalAreaOffset; }
/// isFPCloseToIncomingSP - Return true if the frame pointer is close to
/// the incoming stack pointer, false if it is close to the post-prologue
/// stack pointer.
virtual bool isFPCloseToIncomingSP() const { return true; }
/// assignCalleeSavedSpillSlots - Allows target to override spill slot
/// assignment logic. If implemented, assignCalleeSavedSpillSlots() should
/// assign frame slots to all CSI entries and return true. If this method
/// returns false, spill slots will be assigned using generic implementation.
/// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of
/// CSI.
virtual bool
assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
return false;
}
/// getCalleeSavedSpillSlots - This method returns a pointer to an array of
/// pairs, that contains an entry for each callee saved register that must be
/// spilled to a particular stack location if it is spilled.
///
/// Each entry in this array contains a <register,offset> pair, indicating the
/// fixed offset from the incoming stack pointer that each register should be
/// spilled at. If a register is not listed here, the code generator is
/// allowed to spill it anywhere it chooses.
///
virtual const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const {
NumEntries = 0;
return nullptr;
}
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
/// time).
virtual bool targetHandlesStackFrameRounding() const {
return false;
}
/// Returns true if the target will correctly handle shrink wrapping.
virtual bool enableShrinkWrapping(const MachineFunction &MF) const {
return false;
}
/// Returns true if the stack slot holes in the fixed and callee-save stack
/// area should be used when allocating other stack locations to reduce stack
/// size.
virtual bool enableStackSlotScavenging(const MachineFunction &MF) const {
return false;
}
/// Returns true if the target can safely skip saving callee-saved registers
/// for noreturn nounwind functions.
virtual bool enableCalleeSaveSkip(const MachineFunction &MF) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
virtual void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const = 0;
virtual void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const = 0;
/// With basic block sections, emit callee saved frame moves for basic blocks
/// that are in a different section.
virtual void
emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {}
virtual void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
bool IsPrologue) const {}
/// Replace a StackProbe stub (if any) with the actual probe code inline
virtual void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// Adjust the prologue to have the function use segmented stacks. This works
/// by adding a check even before the "normal" function prologue.
virtual void adjustForSegmentedStacks(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in
/// the assembly prologue to explicitly handle the stack.
virtual void adjustForHiPEPrologue(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of store instructions via
/// storeRegToStackSlot(). Returns false otherwise.
virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const {
return false;
}
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of load instructions via loadRegToStackSlot().
/// If it returns true, and any of the registers in CSI is not restored,
/// it sets the corresponding Restored flag in CSI to false.
/// Returns false otherwise.
virtual bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const {
return false;
}
/// Return true if the target wants to keep the frame pointer regardless of
/// the function attribute "frame-pointer".
virtual bool keepFramePointer(const MachineFunction &MF) const {
return false;
}
/// hasFP - Return true if the specified function should have a dedicated
/// frame pointer register. For most targets this is true only if the function
/// has variable sized allocas or if frame pointer elimination is disabled.
virtual bool hasFP(const MachineFunction &MF) const = 0;
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
virtual bool hasReservedCallFrame(const MachineFunction &MF) const {
return !hasFP(MF);
}
/// canSimplifyCallFramePseudos - When possible, it's best to simplify the
/// call frame pseudo ops before doing frame index elimination. This is
/// possible only when frame index references between the pseudos won't
/// need adjusting for the call frame adjustments. Normally, that's true
/// if the function has a reserved call frame or a frame pointer. Some
/// targets (Thumb2, for example) may have more complicated criteria,
/// however, and can override this behavior.
virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || hasFP(MF);
}
// needsFrameIndexResolution - Do we need to perform FI resolution for
// this function? Normally, this is required only when the function
// has any stack objects. However, targets may want to override this.
virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
/// getFrameIndexReference - This method should return the base register
/// and offset used to reference a frame index location. The offset is
/// returned directly, and the base register is returned via FrameReg.
virtual int getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const;
/// Same as \c getFrameIndexReference, except that the stack pointer (as
/// opposed to the frame pointer) will be the preferred value for \p
/// FrameReg. This is generally used for emitting statepoint or EH tables that
/// use offsets from RSP. If \p IgnoreSPUpdates is true, the returned
/// offset is only guaranteed to be valid with respect to the value of SP at
/// the end of the prologue.
virtual int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
Register &FrameReg,
bool IgnoreSPUpdates) const {
// Always safe to dispatch to getFrameIndexReference.
return getFrameIndexReference(MF, FI, FrameReg);
}
/// getNonLocalFrameIndexReference - This method returns the offset used to
/// reference a frame index location. The offset can be from either FP/BP/SP
/// based on which base register is returned by llvm.localaddress.
virtual int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const {
// By default, dispatch to getFrameIndexReference. Interested targets can
// override this.
Register FrameReg;
return getFrameIndexReference(MF, FI, FrameReg);
}
/// Returns the callee-saved registers as computed by determineCalleeSaves
/// in the BitVector \p SavedRegs.
virtual void getCalleeSaves(const MachineFunction &MF,
BitVector &SavedRegs) const;
/// This method determines which of the registers reported by
/// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved.
/// The default implementation populates the \p SavedRegs bitset with
/// all registers which are modified in the function; targets may override
/// this function to save additional registers.
/// This method also sets up the register scavenger, ensuring there is a free
/// register or a frame index available.
/// This method should not be called by any passes outside of PEI, because
/// it may change state passed in by \p MF and \p RS. The preferred
/// interface outside PEI is getCalleeSaves.
virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
/// processFunctionBeforeFrameFinalized - This method is called immediately
/// before the specified function's frame layout (MF.getFrameInfo()) is
/// finalized. Once the frame is finalized, MO_FrameIndex operands are
/// replaced with direct constants. This method is optional.
///
virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const {
}
/// processFunctionBeforeFrameIndicesReplaced - This method is called
/// immediately before MO_FrameIndex operands are eliminated, but after the
/// frame is finalized. This method is optional.
virtual void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS = nullptr) const {}
virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const {
report_fatal_error("WinEH not implemented for this target");
}
/// This method is called during prolog/epilog code insertion to eliminate
/// call frame setup and destroy pseudo instructions (but only if the Target
/// is using them). It is responsible for eliminating these instructions,
/// replacing them with concrete instructions. This method need only be
/// implemented if using call frame setup/destroy pseudo instructions.
/// Returns an iterator pointing to the instruction after the replaced one.
virtual MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
llvm_unreachable("Call Frame Pseudo Instructions do not exist on this "
"target!");
}
/// Order the symbols in the local stack frame.
/// The list of objects that we want to order is in \p objectsToAllocate as
/// indices into the MachineFrameInfo. The array can be reordered in any way
/// upon return. The contents of the array, however, may not be modified (i.e.
/// only their order may be changed).
/// By default, just maintain the original order.
virtual void
orderFrameObjects(const MachineFunction &MF,
SmallVectorImpl<int> &objectsToAllocate) const {
}
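// A minimal sketch of an override for a hypothetical target that allocates
// larger objects first; it only permutes the existing frame indices, as the
// contract above requires (the heuristic itself is purely illustrative):
//
//   void MyTargetFrameLowering::orderFrameObjects(
//       const MachineFunction &MF,
//       SmallVectorImpl<int> &objectsToAllocate) const {
//     const MachineFrameInfo &MFI = MF.getFrameInfo();
//     llvm::stable_sort(objectsToAllocate, [&](int A, int B) {
//       return MFI.getObjectSize(A) > MFI.getObjectSize(B);
//     });
//   }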
/// Check whether or not the given \p MBB can be used as a prologue
/// for the target.
/// The prologue will be inserted first in this basic block.
/// This method is used by the shrink-wrapping pass to decide if
/// \p MBB will be correctly handled by the target.
/// If the target enables shrink-wrapping without overriding this method, we
/// assume that each basic block is a valid prologue.
virtual bool canUseAsPrologue(const MachineBasicBlock &MBB) const {
return true;
}
/// Check whether or not the given \p MBB can be used as an epilogue
/// for the target.
/// The epilogue will be inserted before the first terminator of that block.
/// This method is used by the shrink-wrapping pass to decide if
/// \p MBB will be correctly handled by the target.
/// If the target enables shrink-wrapping without overriding this method, we
/// assume that each basic block is a valid epilogue.
virtual bool canUseAsEpilogue(const MachineBasicBlock &MBB) const {
return true;
}
/// Returns the StackID that scalable vectors should be associated with.
virtual TargetStackID::Value getStackIDForScalableVectors() const {
return TargetStackID::Default;
}
virtual bool isSupportedStackID(TargetStackID::Value ID) const {
switch (ID) {
default:
return false;
case TargetStackID::Default:
case TargetStackID::NoAlloc:
return true;
}
}
/// Check if the given function is safe for not having callee saved registers.
/// This is used when interprocedural register allocation is enabled.
static bool isSafeForNoCSROpt(const Function &F);
/// Check if the no-CSR optimisation is profitable for the given function.
virtual bool isProfitableForNoCSROpt(const Function &F) const {
return true;
}
/// Return the initial CFA offset value, i.e. the one valid at the beginning of
/// the function (before any stack operations).
virtual int getInitialCFAOffset(const MachineFunction &MF) const;
/// Return the initial CFA register value, i.e. the one valid at the beginning
/// of the function (before any stack operations).
virtual Register getInitialCFARegister(const MachineFunction &MF) const;
/// Return the frame base information to be encoded in the DWARF subprogram
/// debug info.
virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const;
};
} // End llvm namespace
#endif
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 74664098ce1d..33f122728d2a 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1,2118 +1,2121 @@
//===- BasicAliasAnalysis.cpp - Stateless Alias Analysis Impl -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the primary stateless implementation of the
// Alias Analysis interface that implements identities (two different
// globals cannot alias, etc), but does no stateful analysis.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/PhiValues.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <utility>
#define DEBUG_TYPE "basicaa"
using namespace llvm;
/// Enable analysis of recursive PHI nodes.
static cl::opt<bool> EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden,
cl::init(false));
/// By default, even on 32-bit architectures we use 64-bit integers for
/// calculations. This will allow us to more-aggressively decompose indexing
/// expressions calculated using i64 values (e.g., long long in C) which is
/// common enough to worry about.
static cl::opt<bool> ForceAtLeast64Bits("basic-aa-force-at-least-64b",
cl::Hidden, cl::init(true));
static cl::opt<bool> DoubleCalcBits("basic-aa-double-calc-bits",
cl::Hidden, cl::init(false));
/// SearchLimitReached / SearchTimes show how often the limit to decompose
/// GEPs is reached. It will affect the precision of basic alias analysis.
STATISTIC(SearchLimitReached, "Number of times the limit to "
"decompose GEPs is reached");
STATISTIC(SearchTimes, "Number of times a GEP is decomposed");
/// Cutoff after which to stop analysing a set of phi nodes potentially involved
/// in a cycle. Because we are analysing 'through' phi nodes, we need to be
/// careful with value equivalence. We use reachability to make sure a value
/// cannot be involved in a cycle.
const unsigned MaxNumPhiBBsValueReachabilityCheck = 20;
// The max limit of the search depth in DecomposeGEPExpression() and
// GetUnderlyingObject(). Both functions need to use the same search
// depth; otherwise the algorithm in aliasGEP will assert.
static const unsigned MaxLookupSearchDepth = 6;
bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
// We don't care if this analysis itself is preserved, it has no state. But
// we need to check that the analyses it depends on have been. Note that we
// may be created without handles to some analyses and in that case don't
// depend on them.
if (Inv.invalidate<AssumptionAnalysis>(Fn, PA) ||
(DT && Inv.invalidate<DominatorTreeAnalysis>(Fn, PA)) ||
(LI && Inv.invalidate<LoopAnalysis>(Fn, PA)) ||
(PV && Inv.invalidate<PhiValuesAnalysis>(Fn, PA)))
return true;
// Otherwise this analysis result remains valid.
return false;
}
//===----------------------------------------------------------------------===//
// Useful predicates
//===----------------------------------------------------------------------===//
/// Returns true if the pointer is to a function-local object that never
/// escapes from the function.
static bool isNonEscapingLocalObject(
const Value *V,
SmallDenseMap<const Value *, bool, 8> *IsCapturedCache = nullptr) {
SmallDenseMap<const Value *, bool, 8>::iterator CacheIt;
if (IsCapturedCache) {
bool Inserted;
std::tie(CacheIt, Inserted) = IsCapturedCache->insert({V, false});
if (!Inserted)
// Found cached result, return it!
return CacheIt->second;
}
// If this is a local allocation, check to see if it escapes.
if (isa<AllocaInst>(V) || isNoAliasCall(V)) {
// Set StoreCaptures to True so that we can assume in our callers that the
// pointer is not the result of a load instruction. Currently
// PointerMayBeCaptured doesn't have any special analysis for the
// StoreCaptures=false case; if it did, our callers could be refined to be
// more precise.
auto Ret = !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
if (IsCapturedCache)
CacheIt->second = Ret;
return Ret;
}
// If this is an argument that corresponds to a byval or noalias argument,
// then it has not escaped before entering the function. Check if it escapes
// inside the function.
if (const Argument *A = dyn_cast<Argument>(V))
if (A->hasByValAttr() || A->hasNoAliasAttr()) {
// Note that even if the argument is marked nocapture, we still need to check
// for copies made inside the function. The nocapture attribute only
// specifies that there are no copies made that outlive the function.
auto Ret = !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
if (IsCapturedCache)
CacheIt->second = Ret;
return Ret;
}
return false;
}
/// Returns true if the pointer is one which would have been considered an
/// escape by isNonEscapingLocalObject.
static bool isEscapeSource(const Value *V) {
if (isa<CallBase>(V))
return true;
if (isa<Argument>(V))
return true;
// The load case works because isNonEscapingLocalObject considers all
// stores to be escapes (it passes true for the StoreCaptures argument
// to PointerMayBeCaptured).
if (isa<LoadInst>(V))
return true;
return false;
}
/// Returns the size of the object specified by V or UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool NullIsValidLoc,
bool RoundToAlign = false) {
uint64_t Size;
ObjectSizeOpts Opts;
Opts.RoundToAlign = RoundToAlign;
Opts.NullIsUnknownSize = NullIsValidLoc;
if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
/// Returns true if we can prove that the object specified by V is smaller than
/// Size.
static bool isObjectSmallerThan(const Value *V, uint64_t Size,
const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool NullIsValidLoc) {
// Note that the meanings of the "object" are slightly different in the
// following contexts:
// c1: llvm::getObjectSize()
// c2: llvm.objectsize() intrinsic
// c3: isObjectSmallerThan()
// c1 and c2 share the same meaning; however, the meaning of "object" in c3
// refers to the "entire object".
//
// Consider this example:
// char *p = (char*)malloc(100)
// char *q = p+80;
//
// In the context of c1 and c2, the "object" pointed by q refers to the
// stretch of memory of q[0:19]. So, getObjectSize(q) should return 20.
//
// However, in the context of c3, the "object" refers to the chunk of memory
// being allocated. So, the "object" has 100 bytes, and q points to the middle of
// the "object". If q is passed to isObjectSmallerThan() as the 1st
// parameter, before the llvm::getObjectSize() is called to get the size of
// entire object, we should:
// - either rewind the pointer q to the base-address of the object in
// question (in this case rewind to p), or
// - just give up. It is up to the caller to make sure the pointer is pointing
// to the base address of the object.
//
// We go for the 2nd option for simplicity.
if (!isIdentifiedObject(V))
return false;
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc,
/*RoundToAlign*/ true);
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size;
}
/// Return the minimal extent from \p V to the end of the underlying object,
/// assuming the result is used in an aliasing query. E.g., we do use the query
/// location size and the fact that null pointers cannot alias here.
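///
/// For example (a worked illustration of the rules below), for a pointer
/// attributed dereferenceable_or_null(16) in a context where null is a valid
/// location, the dereferenceability bound is dropped; a precise 8-byte query
/// then yields a minimal extent of 8 bytes.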
static uint64_t getMinimalExtentFrom(const Value &V,
const LocationSize &LocSize,
const DataLayout &DL,
bool NullIsValidLoc) {
// If we have dereferenceability information we know a lower bound for the
// extent as accesses for a lower offset would be valid. We need to exclude
// the "or null" part if null is a valid pointer.
bool CanBeNull;
uint64_t DerefBytes = V.getPointerDereferenceableBytes(DL, CanBeNull);
DerefBytes = (CanBeNull && NullIsValidLoc) ? 0 : DerefBytes;
// If queried with a precise location size, we assume that location size is
// accessed and is therefore valid.
if (LocSize.isPrecise())
DerefBytes = std::max(DerefBytes, LocSize.getValue());
return DerefBytes;
}
/// Returns true if we can prove that the object specified by V has size Size.
static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
const TargetLibraryInfo &TLI, bool NullIsValidLoc) {
uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc);
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size;
}
//===----------------------------------------------------------------------===//
// GetElementPtr Instruction Decomposition and Analysis
//===----------------------------------------------------------------------===//
/// Analyzes the specified value as a linear expression: "A*V + B", where A and
/// B are constant integers.
///
/// Returns the scale and offset values as APInts, returns V as a Value*, and
/// reports whether we looked through any sign or zero extends. The incoming
/// Value is known to have IntegerType, and it may already be sign or zero
/// extended.
///
/// Note that this looks through extends, so the high bits may not be
/// represented in the result.
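///
/// For example (a worked illustration of the rules implemented below), given
///   %a = add nsw i32 %x, 4
///   %b = shl i32 %a, 2
/// analyzing %b yields V = %x, Scale = 4 and Offset = 16; NSW/NUW are not
/// propagated through the shift.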
/*static*/ const Value *BasicAAResult::GetLinearExpression(
const Value *V, APInt &Scale, APInt &Offset, unsigned &ZExtBits,
unsigned &SExtBits, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW) {
assert(V->getType()->isIntegerTy() && "Not an integer value");
// Limit our recursion depth.
if (Depth == 6) {
Scale = 1;
Offset = 0;
return V;
}
if (const ConstantInt *Const = dyn_cast<ConstantInt>(V)) {
// If it's a constant, just convert it to an offset and remove the variable.
// If we've been called recursively, the Offset bit width will be greater
// than the constant's (the Offset's always as wide as the outermost call),
// so we'll zext here and process any extension in the isa<SExtInst> &
// isa<ZExtInst> cases below.
Offset += Const->getValue().zextOrSelf(Offset.getBitWidth());
assert(Scale == 0 && "Constant values don't have a scale");
return V;
}
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
// If we've been called recursively, then Offset and Scale will be wider
// than the BOp operands. We'll always zext it here as we'll process sign
// extensions below (see the isa<SExtInst> / isa<ZExtInst> cases).
APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth());
switch (BOp->getOpcode()) {
default:
// We don't understand this instruction, so we can't decompose it any
// further.
Scale = 1;
Offset = 0;
return V;
case Instruction::Or:
// X|C == X+C if all the bits in C are unset in X. Otherwise we can't
// analyze it.
if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), DL, 0, AC,
BOp, DT)) {
Scale = 1;
Offset = 0;
return V;
}
LLVM_FALLTHROUGH;
case Instruction::Add:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset += RHS;
break;
case Instruction::Sub:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset -= RHS;
break;
case Instruction::Mul:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset *= RHS;
Scale *= RHS;
break;
case Instruction::Shl:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
// We're trying to linearize an expression of the kind:
// shl i8 -128, 36
// where the shift count exceeds the bitwidth of the type.
// We can't decompose this further (the expression would return
// a poison value).
if (Offset.getBitWidth() < RHS.getLimitedValue() ||
Scale.getBitWidth() < RHS.getLimitedValue()) {
Scale = 1;
Offset = 0;
return V;
}
Offset <<= RHS.getLimitedValue();
Scale <<= RHS.getLimitedValue();
// the semantics of nsw and nuw for left shifts don't match those of
// multiplications, so we won't propagate them.
NSW = NUW = false;
return V;
}
if (isa<OverflowingBinaryOperator>(BOp)) {
NUW &= BOp->hasNoUnsignedWrap();
NSW &= BOp->hasNoSignedWrap();
}
return V;
}
}
// Since GEP indices are sign extended anyway, we don't care about the high
// bits of a sign or zero extended value - just scales and offsets. The
// extensions have to be consistent though.
if (isa<SExtInst>(V) || isa<ZExtInst>(V)) {
Value *CastOp = cast<CastInst>(V)->getOperand(0);
unsigned NewWidth = V->getType()->getPrimitiveSizeInBits();
unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits();
unsigned OldZExtBits = ZExtBits, OldSExtBits = SExtBits;
const Value *Result =
GetLinearExpression(CastOp, Scale, Offset, ZExtBits, SExtBits, DL,
Depth + 1, AC, DT, NSW, NUW);
// zext(zext(%x)) == zext(%x), and similarly for sext; we'll handle this
// by just incrementing the number of bits we've extended by.
unsigned ExtendedBy = NewWidth - SmallWidth;
if (isa<SExtInst>(V) && ZExtBits == 0) {
// sext(sext(%x, a), b) == sext(%x, a + b)
if (NSW) {
// We haven't sign-wrapped, so it's valid to decompose sext(%x + c)
// into sext(%x) + sext(c). We'll sext the Offset ourselves:
unsigned OldWidth = Offset.getBitWidth();
Offset = Offset.trunc(SmallWidth).sext(NewWidth).zextOrSelf(OldWidth);
} else {
// We may have signed-wrapped, so don't decompose sext(%x + c) into
// sext(%x) + sext(c)
Scale = 1;
Offset = 0;
Result = CastOp;
ZExtBits = OldZExtBits;
SExtBits = OldSExtBits;
}
SExtBits += ExtendedBy;
} else {
// sext(zext(%x, a), b) = zext(zext(%x, a), b) = zext(%x, a + b)
if (!NUW) {
// We may have unsigned-wrapped, so don't decompose zext(%x + c) into
// zext(%x) + zext(c)
Scale = 1;
Offset = 0;
Result = CastOp;
ZExtBits = OldZExtBits;
SExtBits = OldSExtBits;
}
ZExtBits += ExtendedBy;
}
return Result;
}
Scale = 1;
Offset = 0;
return V;
}
/// Ensure a pointer offset fits in an integer of size PointerSize
/// (in bits) when that size is smaller than the maximum pointer size. This is
/// an issue, in particular, for 32b pointers with negative indices that rely
/// on two's complement wrap-arounds for precise alias information where the
/// maximum pointer size is 64b.
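///
/// For example (a worked illustration of the computation below), with a 64-bit
/// calculation width and 32-bit pointers, an accumulated offset of
/// 0x00000000FFFFFFFF is sign-extended from bit 31 and becomes -1, matching
/// the two's complement wrap-around of the 32-bit address space.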
static APInt adjustToPointerSize(const APInt &Offset, unsigned PointerSize) {
assert(PointerSize <= Offset.getBitWidth() && "Invalid PointerSize!");
unsigned ShiftBits = Offset.getBitWidth() - PointerSize;
return (Offset << ShiftBits).ashr(ShiftBits);
}
static unsigned getMaxPointerSize(const DataLayout &DL) {
unsigned MaxPointerSize = DL.getMaxPointerSizeInBits();
if (MaxPointerSize < 64 && ForceAtLeast64Bits) MaxPointerSize = 64;
if (DoubleCalcBits) MaxPointerSize *= 2;
return MaxPointerSize;
}
/// If V is a symbolic pointer expression, decompose it into a base pointer
/// with a constant offset and a number of scaled symbolic offsets.
///
/// The scaled symbolic offsets (represented by pairs of a Value* and a scale
/// in the VarIndices vector) are Value*'s that are known to be scaled by the
/// specified amount, but which may have other unrepresented high bits. As
/// such, the gep cannot necessarily be reconstructed from its decomposed form.
///
/// When DataLayout is around, this function is capable of analyzing everything
/// that GetUnderlyingObject can look through. To be able to do that,
/// GetUnderlyingObject and DecomposeGEPExpression must use the same search
/// depth (MaxLookupSearchDepth). When DataLayout is not around, it just looks
/// through pointer casts.
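///
/// For example (a worked illustration assuming a typical 64-bit DataLayout;
/// %struct.S = type { i32, [8 x i64] } and %base, %i are placeholders), for
///   %p = getelementptr inbounds %struct.S, %struct.S* %base, i64 0, i32 1, i64 %i
/// the decomposition is Base = %base, StructOffset = 8 (the offset of the array
/// field), OtherOffset = 0, and a single variable index {%i, Scale = 8}.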
bool BasicAAResult::DecomposeGEPExpression(const Value *V,
DecomposedGEP &Decomposed, const DataLayout &DL, AssumptionCache *AC,
DominatorTree *DT) {
// Limit recursion depth to limit compile time in crazy cases.
unsigned MaxLookup = MaxLookupSearchDepth;
SearchTimes++;
unsigned MaxPointerSize = getMaxPointerSize(DL);
Decomposed.VarIndices.clear();
do {
// See if this is a bitcast or GEP.
const Operator *Op = dyn_cast<Operator>(V);
if (!Op) {
// The only non-operator case we can handle is a GlobalAlias.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (!GA->isInterposable()) {
V = GA->getAliasee();
continue;
}
}
Decomposed.Base = V;
return false;
}
if (Op->getOpcode() == Instruction::BitCast ||
Op->getOpcode() == Instruction::AddrSpaceCast) {
V = Op->getOperand(0);
continue;
}
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
if (const auto *PHI = dyn_cast<PHINode>(V)) {
// Look through single-arg phi nodes created by LCSSA.
if (PHI->getNumIncomingValues() == 1) {
V = PHI->getIncomingValue(0);
continue;
}
} else if (const auto *Call = dyn_cast<CallBase>(V)) {
// CaptureTracking can know about special capturing properties of some
// intrinsics like launder.invariant.group that can't be expressed with
// the attributes but that have properties like returning an aliasing pointer.
// Because some analyses may assume that a nocapture pointer is not
// returned from certain special intrinsics (because the function would have
// to be marked with a returned attribute), it is crucial to use this
// function, which should be kept in sync with CaptureTracking. Not using it
// may cause weird miscompilations where two aliasing pointers are assumed
// not to alias.
if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) {
V = RP;
continue;
}
}
Decomposed.Base = V;
return false;
}
// Don't attempt to analyze GEPs over unsized objects.
if (!GEPOp->getSourceElementType()->isSized()) {
Decomposed.Base = V;
return false;
}
// Don't attempt to analyze GEPs if index scale is not a compile-time
// constant.
if (isa<ScalableVectorType>(GEPOp->getSourceElementType())) {
Decomposed.Base = V;
Decomposed.HasCompileTimeConstantScale = false;
return false;
}
unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
unsigned PointerSize = DL.getPointerSizeInBits(AS);
// Assume all GEP operands are constants until proven otherwise.
bool GepHasConstantOffset = true;
for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end();
I != E; ++I, ++GTI) {
const Value *Index = *I;
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = GTI.getStructTypeOrNull()) {
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
if (FieldNo == 0)
continue;
Decomposed.StructOffset +=
DL.getStructLayout(STy)->getElementOffset(FieldNo);
continue;
}
// For an array/pointer, add the element offset, explicitly scaled.
if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
if (CIdx->isZero())
continue;
Decomposed.OtherOffset +=
(DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize() *
CIdx->getValue().sextOrSelf(MaxPointerSize))
.sextOrTrunc(MaxPointerSize);
continue;
}
GepHasConstantOffset = false;
APInt Scale(MaxPointerSize,
DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
unsigned ZExtBits = 0, SExtBits = 0;
// If the integer type is smaller than the pointer size, it is implicitly
// sign extended to pointer size.
unsigned Width = Index->getType()->getIntegerBitWidth();
if (PointerSize > Width)
SExtBits += PointerSize - Width;
// Use GetLinearExpression to decompose the index into a C1*V+C2 form.
APInt IndexScale(Width, 0), IndexOffset(Width, 0);
bool NSW = true, NUW = true;
const Value *OrigIndex = Index;
Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits,
SExtBits, DL, 0, AC, DT, NSW, NUW);
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
// It can be the case that, even though C1*V+C2 does not overflow for
// relevant values of V, (C2*Scale) can overflow. In that case, we cannot
// decompose the expression in this way.
//
// FIXME: C1*Scale and the other operations in the decomposed
// (C1*Scale)*V+C2*Scale can also overflow. We should check for this
// possibility.
APInt WideScaledOffset = IndexOffset.sextOrTrunc(MaxPointerSize*2) *
Scale.sext(MaxPointerSize*2);
if (WideScaledOffset.getMinSignedBits() > MaxPointerSize) {
Index = OrigIndex;
IndexScale = 1;
IndexOffset = 0;
ZExtBits = SExtBits = 0;
if (PointerSize > Width)
SExtBits += PointerSize - Width;
} else {
Decomposed.OtherOffset += IndexOffset.sextOrTrunc(MaxPointerSize) * Scale;
Scale *= IndexScale.sextOrTrunc(MaxPointerSize);
}
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) {
if (Decomposed.VarIndices[i].V == Index &&
Decomposed.VarIndices[i].ZExtBits == ZExtBits &&
Decomposed.VarIndices[i].SExtBits == SExtBits) {
Scale += Decomposed.VarIndices[i].Scale;
Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i);
break;
}
}
// Make sure that we have a scale that makes sense for this target's
// pointer size.
Scale = adjustToPointerSize(Scale, PointerSize);
if (!!Scale) {
VariableGEPIndex Entry = {Index, ZExtBits, SExtBits, Scale};
Decomposed.VarIndices.push_back(Entry);
}
}
// Take care of wrap-arounds
if (GepHasConstantOffset) {
Decomposed.StructOffset =
adjustToPointerSize(Decomposed.StructOffset, PointerSize);
Decomposed.OtherOffset =
adjustToPointerSize(Decomposed.OtherOffset, PointerSize);
}
// Analyze the base pointer next.
V = GEPOp->getOperand(0);
} while (--MaxLookup);
// If the chain of expressions is too deep, just return early.
Decomposed.Base = V;
SearchLimitReached++;
return true;
}
/// Returns whether the given pointer value points to memory that is local to
/// the function, with global constants being considered local to all
/// functions.
bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
AAQueryInfo &AAQI, bool OrLocal) {
assert(Visited.empty() && "Visited must be cleared after use!");
unsigned MaxLookup = 8;
SmallVector<const Value *, 16> Worklist;
Worklist.push_back(Loc.Ptr);
do {
const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL);
if (!Visited.insert(V).second) {
Visited.clear();
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
}
// An alloca instruction defines local memory.
if (OrLocal && isa<AllocaInst>(V))
continue;
// A global constant counts as local memory for our purposes.
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
// Note: this doesn't require GV to be "ODR" because it isn't legal for a
// global to be marked constant in some modules and non-constant in
// others. GV may even be a declaration, not a definition.
if (!GV->isConstant()) {
Visited.clear();
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
}
continue;
}
// If both select values point to local memory, then so does the select.
if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
Worklist.push_back(SI->getTrueValue());
Worklist.push_back(SI->getFalseValue());
continue;
}
// If all values incoming to a phi node point to local memory, then so does
// the phi.
if (const PHINode *PN = dyn_cast<PHINode>(V)) {
// Don't bother inspecting phi nodes with many operands.
if (PN->getNumIncomingValues() > MaxLookup) {
Visited.clear();
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
}
for (Value *IncValue : PN->incoming_values())
Worklist.push_back(IncValue);
continue;
}
// Otherwise be conservative.
Visited.clear();
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
} while (!Worklist.empty() && --MaxLookup);
Visited.clear();
return Worklist.empty();
}
/// Returns the behavior when calling the given call site.
FunctionModRefBehavior BasicAAResult::getModRefBehavior(const CallBase *Call) {
if (Call->doesNotAccessMemory())
// Can't do better than this.
return FMRB_DoesNotAccessMemory;
FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
// If the callsite knows it only reads memory, don't return worse
// than that.
if (Call->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
else if (Call->doesNotReadMemory())
Min = FMRB_OnlyWritesMemory;
if (Call->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
else if (Call->onlyAccessesInaccessibleMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleMem);
else if (Call->onlyAccessesInaccessibleMemOrArgMem())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleOrArgMem);
// If the call has operand bundles then aliasing attributes from the function
// it calls do not directly apply to the call. This can be made more precise
// in the future.
if (!Call->hasOperandBundles())
if (const Function *F = Call->getCalledFunction())
Min =
FunctionModRefBehavior(Min & getBestAAResults().getModRefBehavior(F));
return Min;
}
/// Returns the behavior when calling the given function. For use when the call
/// site is not known.
FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
// If the function declares it doesn't access memory, we can't do better.
if (F->doesNotAccessMemory())
return FMRB_DoesNotAccessMemory;
FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
// If the function declares it only reads memory, go with that.
if (F->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
else if (F->doesNotReadMemory())
Min = FMRB_OnlyWritesMemory;
if (F->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
else if (F->onlyAccessesInaccessibleMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleMem);
else if (F->onlyAccessesInaccessibleMemOrArgMem())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleOrArgMem);
return Min;
}
/// Returns true if this is a writeonly (i.e Mod only) parameter.
static bool isWriteOnlyParam(const CallBase *Call, unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
if (Call->paramHasAttr(ArgIdx, Attribute::WriteOnly))
return true;
// We can bound the aliasing properties of memset_pattern16 just as we can
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
// FIXME Consider handling this in InferFunctionAttr.cpp together with other
// attributes.
LibFunc F;
if (Call->getCalledFunction() &&
TLI.getLibFunc(*Call->getCalledFunction(), F) &&
F == LibFunc_memset_pattern16 && TLI.has(F))
if (ArgIdx == 0)
return true;
// TODO: memset_pattern4, memset_pattern8
// TODO: _chk variants
// TODO: strcmp, strcpy
return false;
}
ModRefInfo BasicAAResult::getArgModRefInfo(const CallBase *Call,
unsigned ArgIdx) {
// Checking for known builtin intrinsics and target library functions.
if (isWriteOnlyParam(Call, ArgIdx, TLI))
return ModRefInfo::Mod;
if (Call->paramHasAttr(ArgIdx, Attribute::ReadOnly))
return ModRefInfo::Ref;
if (Call->paramHasAttr(ArgIdx, Attribute::ReadNone))
return ModRefInfo::NoModRef;
return AAResultBase::getArgModRefInfo(Call, ArgIdx);
}
static bool isIntrinsicCall(const CallBase *Call, Intrinsic::ID IID) {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call);
return II && II->getIntrinsicID() == IID;
}
#ifndef NDEBUG
static const Function *getParent(const Value *V) {
if (const Instruction *inst = dyn_cast<Instruction>(V)) {
if (!inst->getParent())
return nullptr;
return inst->getParent()->getParent();
}
if (const Argument *arg = dyn_cast<Argument>(V))
return arg->getParent();
return nullptr;
}
static bool notDifferentParent(const Value *O1, const Value *O2) {
const Function *F1 = getParent(O1);
const Function *F2 = getParent(O2);
return !F1 || !F2 || F1 == F2;
}
#endif
AliasResult BasicAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB,
AAQueryInfo &AAQI) {
assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
"BasicAliasAnalysis doesn't support interprocedural queries.");
// If we have a directly cached entry for these locations, we have recursed
// through this once, so just return the cached results. Notably, when this
// happens, we don't clear the cache.
auto CacheIt = AAQI.AliasCache.find(AAQueryInfo::LocPair(LocA, LocB));
if (CacheIt != AAQI.AliasCache.end())
return CacheIt->second;
CacheIt = AAQI.AliasCache.find(AAQueryInfo::LocPair(LocB, LocA));
if (CacheIt != AAQI.AliasCache.end())
return CacheIt->second;
AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, LocB.Ptr,
LocB.Size, LocB.AATags, AAQI);
VisitedPhiBBs.clear();
return Alias;
}
/// Checks to see if the specified callsite can clobber the specified memory
/// object.
///
/// Since we only look at local properties of this function, we really can't
/// say much about this query. We do, however, use simple "address taken"
/// analysis on local objects.
ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
assert(notDifferentParent(Call, Loc.Ptr) &&
"AliasAnalysis query involving multiple functions!");
const Value *Object = GetUnderlyingObject(Loc.Ptr, DL);
// Calls marked 'tail' cannot read or write allocas from the current frame
// because the current frame might be destroyed by the time they run. However,
// a tail call may use an alloca with byval. Calling with byval copies the
// contents of the alloca into argument registers or stack slots, so there is
// no lifetime issue.
if (isa<AllocaInst>(Object))
if (const CallInst *CI = dyn_cast<CallInst>(Call))
if (CI->isTailCall() &&
!CI->getAttributes().hasAttrSomewhere(Attribute::ByVal))
return ModRefInfo::NoModRef;
// Stack restore is able to modify unescaped dynamic allocas. Assume it may
// modify them even though the alloca is not escaped.
if (auto *AI = dyn_cast<AllocaInst>(Object))
if (!AI->isStaticAlloca() && isIntrinsicCall(Call, Intrinsic::stackrestore))
return ModRefInfo::Mod;
// If the pointer is to a locally allocated object that does not escape,
// then the call cannot mod/ref the pointer unless the call takes the pointer
// as an argument, and itself doesn't capture it.
if (!isa<Constant>(Object) && Call != Object &&
isNonEscapingLocalObject(Object, &AAQI.IsCapturedCache)) {
// Optimistically assume that call doesn't touch Object and check this
// assumption in the following loop.
ModRefInfo Result = ModRefInfo::NoModRef;
bool IsMustAlias = true;
unsigned OperandNo = 0;
for (auto CI = Call->data_operands_begin(), CE = Call->data_operands_end();
CI != CE; ++CI, ++OperandNo) {
// Only look at the no-capture or byval pointer arguments. If this
// pointer were passed to arguments that were neither of these, then it
// couldn't be no-capture.
if (!(*CI)->getType()->isPointerTy() ||
(!Call->doesNotCapture(OperandNo) &&
OperandNo < Call->getNumArgOperands() &&
!Call->isByValArgument(OperandNo)))
continue;
// Call doesn't access memory through this operand, so we don't care
// if it aliases with Object.
if (Call->doesNotAccessMemory(OperandNo))
continue;
// If this is a no-capture pointer argument, see if we can tell that it
// is impossible to alias the pointer we're checking.
AliasResult AR = getBestAAResults().alias(MemoryLocation(*CI),
MemoryLocation(Object), AAQI);
if (AR != MustAlias)
IsMustAlias = false;
// Operand doesn't alias 'Object', continue looking for other aliases
if (AR == NoAlias)
continue;
// Operand aliases 'Object', but call doesn't modify it. Strengthen
// initial assumption and keep looking in case there are more aliases.
if (Call->onlyReadsMemory(OperandNo)) {
Result = setRef(Result);
continue;
}
// Operand aliases 'Object' but call only writes into it.
if (Call->doesNotReadMemory(OperandNo)) {
Result = setMod(Result);
continue;
}
// This operand aliases 'Object' and call reads and writes into it.
// Setting ModRef will not yield an early return below, MustAlias is not
// used further.
Result = ModRefInfo::ModRef;
break;
}
// If no operand aliases, reset the Must bit. It is added back below if at
// least one operand aliases and all aliases found are MustAlias.
if (isNoModRef(Result))
IsMustAlias = false;
// Early return if we improved mod ref information
if (!isModAndRefSet(Result)) {
if (isNoModRef(Result))
return ModRefInfo::NoModRef;
return IsMustAlias ? setMust(Result) : clearMust(Result);
}
}
// If the call is malloc/calloc like, we can assume that it doesn't
// modify any IR visible value. This is only valid because we assume these
// routines do not read values visible in the IR. TODO: Consider special
// casing realloc and strdup routines which access only their arguments as
// well. Or alternatively, replace all of this with inaccessiblememonly once
// that's implemented fully.
if (isMallocOrCallocLikeFn(Call, &TLI)) {
// Be conservative if the accessed pointer may alias the allocation -
// fallback to the generic handling below.
if (getBestAAResults().alias(MemoryLocation(Call), Loc, AAQI) == NoAlias)
return ModRefInfo::NoModRef;
}
// The semantics of memcpy intrinsics forbid overlap between their respective
// operands, i.e., source and destination of any given memcpy must no-alias.
// If Loc must-aliases either one of these two locations, then it necessarily
// no-aliases the other.
if (auto *Inst = dyn_cast<AnyMemCpyInst>(Call)) {
AliasResult SrcAA, DestAA;
if ((SrcAA = getBestAAResults().alias(MemoryLocation::getForSource(Inst),
Loc, AAQI)) == MustAlias)
// Loc is exactly the memcpy source thus disjoint from memcpy dest.
return ModRefInfo::Ref;
if ((DestAA = getBestAAResults().alias(MemoryLocation::getForDest(Inst),
Loc, AAQI)) == MustAlias)
// The converse case.
return ModRefInfo::Mod;
// It's also possible for Loc to alias both src and dest, or neither.
ModRefInfo rv = ModRefInfo::NoModRef;
if (SrcAA != NoAlias)
rv = setRef(rv);
if (DestAA != NoAlias)
rv = setMod(rv);
return rv;
}
// While the assume intrinsic is marked as arbitrarily writing so that
// proper control dependencies will be maintained, it never aliases any
// particular memory location.
if (isIntrinsicCall(Call, Intrinsic::assume))
return ModRefInfo::NoModRef;
// Like assumes, guard intrinsics are also marked as arbitrarily writing so
// that proper control dependencies are maintained but they never mod any
// particular memory location.
//
// *Unlike* assumes, guard intrinsics are modeled as reading memory since the
// heap state at the point the guard is issued needs to be consistent in case
// the guard invokes the "deopt" continuation.
if (isIntrinsicCall(Call, Intrinsic::experimental_guard))
return ModRefInfo::Ref;
// Like assumes, invariant.start intrinsics were also marked as arbitrarily
// writing so that proper control dependencies are maintained but they never
// mod any particular memory location visible to the IR.
// *Unlike* assumes (which are now modeled as NoModRef), invariant.start
// intrinsic is now modeled as reading memory. This prevents hoisting the
// invariant.start intrinsic over stores. Consider:
// *ptr = 40;
// *ptr = 50;
// invariant_start(ptr)
// int val = *ptr;
// print(val);
//
// This cannot be transformed to:
//
// *ptr = 40;
// invariant_start(ptr)
// *ptr = 50;
// int val = *ptr;
// print(val);
//
// The transformation will cause the second store to be ignored (based on
// rules of invariant.start) and print 40, while the first program always
// prints 50.
if (isIntrinsicCall(Call, Intrinsic::invariant_start))
return ModRefInfo::Ref;
// The AAResultBase base class has some smarts, let's use them.
return AAResultBase::getModRefInfo(Call, Loc, AAQI);
}
ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call1,
const CallBase *Call2,
AAQueryInfo &AAQI) {
// While the assume intrinsic is marked as arbitrarily writing so that
// proper control dependencies will be maintained, it never aliases any
// particular memory location.
if (isIntrinsicCall(Call1, Intrinsic::assume) ||
isIntrinsicCall(Call2, Intrinsic::assume))
return ModRefInfo::NoModRef;
// Like assumes, guard intrinsics are also marked as arbitrarily writing so
// that proper control dependencies are maintained but they never mod any
// particular memory location.
//
// *Unlike* assumes, guard intrinsics are modeled as reading memory since the
// heap state at the point the guard is issued needs to be consistent in case
// the guard invokes the "deopt" continuation.
// NB! This function is *not* commutative, so we special case two
// possibilities for guard intrinsics.
if (isIntrinsicCall(Call1, Intrinsic::experimental_guard))
return isModSet(createModRefInfo(getModRefBehavior(Call2)))
? ModRefInfo::Ref
: ModRefInfo::NoModRef;
if (isIntrinsicCall(Call2, Intrinsic::experimental_guard))
return isModSet(createModRefInfo(getModRefBehavior(Call1)))
? ModRefInfo::Mod
: ModRefInfo::NoModRef;
// The AAResultBase base class has some smarts, let's use them.
return AAResultBase::getModRefInfo(Call1, Call2, AAQI);
}
/// Provide ad-hoc rules to disambiguate accesses through two GEP operators,
/// both having the exact same pointer operand.
static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
LocationSize MaybeV1Size,
const GEPOperator *GEP2,
LocationSize MaybeV2Size,
const DataLayout &DL) {
assert(GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType() &&
"Expected GEPs with the same pointer operand");
// Try to determine whether GEP1 and GEP2 index through arrays, into structs,
// such that the struct field accesses provably cannot alias.
// We also need at least two indices (the pointer, and the struct field).
if (GEP1->getNumIndices() != GEP2->getNumIndices() ||
GEP1->getNumIndices() < 2)
return MayAlias;
// If we don't know the size of the accesses through both GEPs, we can't
// determine whether the struct fields accessed can't alias.
if (MaybeV1Size == LocationSize::unknown() ||
MaybeV2Size == LocationSize::unknown())
return MayAlias;
const uint64_t V1Size = MaybeV1Size.getValue();
const uint64_t V2Size = MaybeV2Size.getValue();
ConstantInt *C1 =
dyn_cast<ConstantInt>(GEP1->getOperand(GEP1->getNumOperands() - 1));
ConstantInt *C2 =
dyn_cast<ConstantInt>(GEP2->getOperand(GEP2->getNumOperands() - 1));
// If the last (struct) indices are constants and are equal, the other indices
// might also be dynamically equal, so the GEPs can alias.
if (C1 && C2) {
unsigned BitWidth = std::max(C1->getBitWidth(), C2->getBitWidth());
if (C1->getValue().sextOrSelf(BitWidth) ==
C2->getValue().sextOrSelf(BitWidth))
return MayAlias;
}
// Find the last-indexed type of the GEP, i.e., the type you'd get if
// you stripped the last index.
// On the way, look at each indexed type. If there's something other
// than an array, different indices can lead to different final types.
SmallVector<Value *, 8> IntermediateIndices;
// Insert the first index; we don't need to check the type indexed
// through it as it only drops the pointer indirection.
assert(GEP1->getNumIndices() > 1 && "Not enough GEP indices to examine");
IntermediateIndices.push_back(GEP1->getOperand(1));
// Insert all the remaining indices but the last one.
// Also, check that they all index through arrays.
for (unsigned i = 1, e = GEP1->getNumIndices() - 1; i != e; ++i) {
if (!isa<ArrayType>(GetElementPtrInst::getIndexedType(
GEP1->getSourceElementType(), IntermediateIndices)))
return MayAlias;
IntermediateIndices.push_back(GEP1->getOperand(i + 1));
}
auto *Ty = GetElementPtrInst::getIndexedType(
GEP1->getSourceElementType(), IntermediateIndices);
StructType *LastIndexedStruct = dyn_cast<StructType>(Ty);
if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
// We know that:
// - both GEPs begin indexing from the exact same pointer;
// - the last indices in both GEPs are constants, indexing into a sequential
// type (array or vector);
// - both GEPs only index through arrays prior to that.
//
// Because array indices greater than the number of elements are valid in
// GEPs, unless we know the intermediate indices are identical between
// GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't
// partially overlap. We also need to check that the loaded size matches
// the element size, otherwise we could still have overlap.
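//
// For example (illustrative IR), with
//   %a = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 1
//   %b = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 2
// two 4-byte accesses through %a and %b cannot alias, but 8-byte accesses
// would overlap at element 2, which is why the access size must match the
// element size.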
Type *LastElementTy = GetElementPtrInst::getTypeAtIndex(Ty, (uint64_t)0);
const uint64_t ElementSize =
DL.getTypeStoreSize(LastElementTy).getFixedSize();
if (V1Size != ElementSize || V2Size != ElementSize)
return MayAlias;
for (unsigned i = 0, e = GEP1->getNumIndices() - 1; i != e; ++i)
if (GEP1->getOperand(i + 1) != GEP2->getOperand(i + 1))
return MayAlias;
// Now we know that the array/pointer that GEP1 indexes into and the one
// that GEP2 indexes into must either precisely overlap or be disjoint.
// Because they cannot partially overlap and because fields in an array
// cannot overlap, if we can prove the final indices are different between
// GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias.
// If the last indices are constants, we've already checked they don't
// equal each other so we can exit early.
if (C1 && C2)
return NoAlias;
{
Value *GEP1LastIdx = GEP1->getOperand(GEP1->getNumOperands() - 1);
Value *GEP2LastIdx = GEP2->getOperand(GEP2->getNumOperands() - 1);
if (isa<PHINode>(GEP1LastIdx) || isa<PHINode>(GEP2LastIdx)) {
// If one of the indices is a PHI node, be safe and only use
// computeKnownBits so we don't make any assumptions about the
// relationships between the two indices. This is important if we're
// asking about values from different loop iterations. See PR32314.
// TODO: We may be able to change the check so we only do this when
// we definitely looked through a PHINode.
if (GEP1LastIdx != GEP2LastIdx &&
GEP1LastIdx->getType() == GEP2LastIdx->getType()) {
KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL);
KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL);
if (Known1.Zero.intersects(Known2.One) ||
Known1.One.intersects(Known2.Zero))
return NoAlias;
}
} else if (isKnownNonEqual(GEP1LastIdx, GEP2LastIdx, DL))
return NoAlias;
}
return MayAlias;
} else if (!LastIndexedStruct || !C1 || !C2) {
return MayAlias;
}
if (C1->getValue().getActiveBits() > 64 ||
C2->getValue().getActiveBits() > 64)
return MayAlias;
// We know that:
// - both GEPs begin indexing from the exact same pointer;
// - the last indices in both GEPs are constants, indexing into a struct;
// - said indices are different, hence, the pointed-to fields are different;
// - both GEPs only index through arrays prior to that.
//
// This lets us determine that the struct that GEP1 indexes into and the
// struct that GEP2 indexes into must either precisely overlap or be
// completely disjoint. Because they cannot partially overlap, indexing into
// different non-overlapping fields of the struct will never alias.
// Therefore, the only remaining thing needed to show that both GEPs can't
// alias is that the fields are not overlapping.
const StructLayout *SL = DL.getStructLayout(LastIndexedStruct);
const uint64_t StructSize = SL->getSizeInBytes();
const uint64_t V1Off = SL->getElementOffset(C1->getZExtValue());
const uint64_t V2Off = SL->getElementOffset(C2->getZExtValue());
auto EltsDontOverlap = [StructSize](uint64_t V1Off, uint64_t V1Size,
uint64_t V2Off, uint64_t V2Size) {
return V1Off < V2Off && V1Off + V1Size <= V2Off &&
((V2Off + V2Size <= StructSize) ||
(V2Off + V2Size - StructSize <= V1Off));
};
if (EltsDontOverlap(V1Off, V1Size, V2Off, V2Size) ||
EltsDontOverlap(V2Off, V2Size, V1Off, V1Size))
return NoAlias;
return MayAlias;
}
// If we have (a) a GEP and (b) a pointer based on an alloca, and the
// beginning of the object the GEP points to would have a negative offset with
// respect to the alloca, that means the GEP cannot alias pointer (b).
// Note that the pointer based on the alloca may not be a GEP. For
// example, it may be the alloca itself.
// The same applies if (b) is based on a GlobalVariable. Note that just being
// based on isIdentifiedObject() is not enough - we need an identified object
// that does not permit access to negative offsets. For example, a negative
// offset from a noalias argument or call can be inbounds w.r.t the actual
// underlying object.
//
// For example, consider:
//
// struct { int f0, int f1, ...} foo;
// foo alloca;
// foo* random = bar(alloca);
// int *f0 = &alloca.f0
// int *f1 = &random->f1;
//
// Which is lowered, approximately, to:
//
// %alloca = alloca %struct.foo
// %random = call %struct.foo* @random(%struct.foo* %alloca)
// %f0 = getelementptr inbounds %struct, %struct.foo* %alloca, i32 0, i32 0
// %f1 = getelementptr inbounds %struct, %struct.foo* %random, i32 0, i32 1
//
// Assume %f1 and %f0 alias. Then %f1 would point into the object allocated
// by %alloca. Since the %f1 GEP is inbounds, that means %random must also
// point into the same object. But since %f0 points to the beginning of %alloca,
// the highest %f1 can be is (%alloca + 3). This means %random cannot be higher
// than (%alloca - 1), and so is not inbounds, a contradiction.
bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
LocationSize MaybeObjectAccessSize) {
// If the object access size is unknown, or the GEP isn't inbounds, bail.
if (MaybeObjectAccessSize == LocationSize::unknown() || !GEPOp->isInBounds())
return false;
const uint64_t ObjectAccessSize = MaybeObjectAccessSize.getValue();
// We need the object to be an alloca or a GlobalVariable, and want to know
// the offset of the pointer from the object precisely, so no variable
// indices are allowed.
if (!(isa<AllocaInst>(DecompObject.Base) ||
isa<GlobalVariable>(DecompObject.Base)) ||
!DecompObject.VarIndices.empty())
return false;
APInt ObjectBaseOffset = DecompObject.StructOffset +
DecompObject.OtherOffset;
// If the GEP has no variable indices, we know the precise offset
// from the base and can use it. If the GEP has variable indices, we can't
// get an exact GEP offset to identify the pointer alias, so return
// false in that case.
if (!DecompGEP.VarIndices.empty())
return false;
APInt GEPBaseOffset = DecompGEP.StructOffset;
GEPBaseOffset += DecompGEP.OtherOffset;
return GEPBaseOffset.sge(ObjectBaseOffset + (int64_t)ObjectAccessSize);
}
/// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against
/// another pointer.
///
/// We know that V1 is a GEP, but we don't know anything about V2.
/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for
/// V2.
AliasResult BasicAAResult::aliasGEP(
const GEPOperator *GEP1, LocationSize V1Size, const AAMDNodes &V1AAInfo,
const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderlyingV1, const Value *UnderlyingV2, AAQueryInfo &AAQI) {
DecomposedGEP DecompGEP1, DecompGEP2;
unsigned MaxPointerSize = getMaxPointerSize(DL);
DecompGEP1.StructOffset = DecompGEP1.OtherOffset = APInt(MaxPointerSize, 0);
DecompGEP2.StructOffset = DecompGEP2.OtherOffset = APInt(MaxPointerSize, 0);
DecompGEP1.HasCompileTimeConstantScale =
DecompGEP2.HasCompileTimeConstantScale = true;
bool GEP1MaxLookupReached =
DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
bool GEP2MaxLookupReached =
DecomposeGEPExpression(V2, DecompGEP2, DL, &AC, DT);
// Don't attempt to analyze the decomposed GEP if index scale is not a
// compile-time constant.
if (!DecompGEP1.HasCompileTimeConstantScale ||
!DecompGEP2.HasCompileTimeConstantScale)
return MayAlias;
APInt GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset;
APInt GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset;
assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 &&
"DecomposeGEPExpression returned a result different from "
"GetUnderlyingObject");
// If the GEP's offset relative to its base is such that the base would
// fall below the start of the object underlying V2, then the GEP and V2
// cannot alias.
if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
isGEPBaseAtNegativeOffset(GEP1, DecompGEP1, DecompGEP2, V2Size))
return NoAlias;
// If we have two gep instructions with must-aliasing or non-aliasing base
// pointers, figure out if the indexes to the GEP tell us anything about the
// derived pointer.
if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
// Check for the GEP base being at a negative offset, this time in the other
// direction.
if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
isGEPBaseAtNegativeOffset(GEP2, DecompGEP2, DecompGEP1, V1Size))
return NoAlias;
// Do the base pointers alias?
AliasResult BaseAlias =
aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(),
UnderlyingV2, LocationSize::unknown(), AAMDNodes(), AAQI);
// Check for geps of non-aliasing underlying pointers where the offsets are
// identical.
if ((BaseAlias == MayAlias) && V1Size == V2Size) {
// Do the base pointers alias assuming type and size.
AliasResult PreciseBaseAlias = aliasCheck(
UnderlyingV1, V1Size, V1AAInfo, UnderlyingV2, V2Size, V2AAInfo, AAQI);
if (PreciseBaseAlias == NoAlias) {
// See if the computed offset from the common pointer tells us about the
// relation of the resulting pointer.
// If the max search depth is reached the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Same offsets.
if (GEP1BaseOffset == GEP2BaseOffset &&
DecompGEP1.VarIndices == DecompGEP2.VarIndices)
return NoAlias;
}
}
// If we get a No or May, then return it immediately, no amount of analysis
// will improve this situation.
if (BaseAlias != MustAlias) {
assert(BaseAlias == NoAlias || BaseAlias == MayAlias);
return BaseAlias;
}
// Otherwise, we have a MustAlias. Since the base pointers alias each other
// exactly, see if the computed offset from the common pointer tells us
// about the relation of the resulting pointer.
// If we know the two GEPs are based off of the exact same pointer (and not
// just the same underlying object), see if that tells us anything about
// the resulting pointers.
if (GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) {
AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL);
// If we couldn't find anything interesting, don't abandon just yet.
if (R != MayAlias)
return R;
}
// If the max search depth is reached, the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Subtract the GEP2 pointer from the GEP1 pointer to find out their
// symbolic difference.
GEP1BaseOffset -= GEP2BaseOffset;
GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices);
} else {
// Check to see if these two pointers are related by the getelementptr
// instruction. If one pointer is a GEP with a non-zero index of the other
// pointer, we know they cannot alias.
// If both accesses are unknown size, we can't do anything useful here.
if (V1Size == LocationSize::unknown() && V2Size == LocationSize::unknown())
return MayAlias;
AliasResult R = aliasCheck(UnderlyingV1, LocationSize::unknown(),
AAMDNodes(), V2, LocationSize::unknown(),
V2AAInfo, AAQI, nullptr, UnderlyingV2);
if (R != MustAlias) {
// If V2 may alias GEP base pointer, conservatively returns MayAlias.
// If V2 is known not to alias GEP base pointer, then the two values
// cannot alias per GEP semantics: "Any memory access must be done through
// a pointer value associated with an address range of the memory access,
// otherwise the behavior is undefined.".
assert(R == NoAlias || R == MayAlias);
return R;
}
// If the max search depth is reached the result is undefined
if (GEP1MaxLookupReached)
return MayAlias;
}
// In the two-GEP case, if there is no difference in the offsets of the
// computed pointers, the resultant pointers are a must alias. This
// happens when we have two lexically identical GEPs (for example).
//
// In the other case, if we have getelementptr <ptr>, 0, 0, 0, 0, ... and V2
// must-aliases the GEP, the end result is a must alias also.
if (GEP1BaseOffset == 0 && DecompGEP1.VarIndices.empty())
return MustAlias;
// If there is a constant difference between the pointers, but the difference
// is less than the size of the associated memory object, then we know
// that the objects are partially overlapping. If the difference is
// greater, we know they do not overlap.
if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) {
if (GEP1BaseOffset.sge(0)) {
if (V2Size != LocationSize::unknown()) {
if (GEP1BaseOffset.ult(V2Size.getValue()))
return PartialAlias;
return NoAlias;
}
} else {
// We have the situation where:
// + +
// | BaseOffset |
// ---------------->|
// |-->V1Size |-------> V2Size
// GEP1 V2
// We need to know that V2Size is not unknown, otherwise we might have
// stripped a gep with a negative index ('gep <ptr>, -1, ...').
if (V1Size != LocationSize::unknown() &&
V2Size != LocationSize::unknown()) {
if ((-GEP1BaseOffset).ult(V1Size.getValue()))
return PartialAlias;
return NoAlias;
}
}
}
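// Worked example of the constant-offset case just above (illustrative
// numbers): with GEP1BaseOffset = 4 and V2Size = 8 the GEP starts inside V2's
// access, so PartialAlias is returned; with GEP1BaseOffset = 8 and V2Size = 8
// the GEP starts exactly past the end of V2's access, so NoAlias is returned.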
if (!DecompGEP1.VarIndices.empty()) {
APInt Modulo(MaxPointerSize, 0);
bool AllPositive = true;
for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) {
// Try to distinguish something like &A[i][1] against &A[42][0].
// Grab the least significant bit set in any of the scales. We
// don't need std::abs here (even if the scale's negative) as we'll
// be ^'ing Modulo with itself later.
Modulo |= DecompGEP1.VarIndices[i].Scale;
if (AllPositive) {
// If the Value could change between cycles, then any reasoning about
// the Value this cycle may not hold in the next cycle. We'll just
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
KnownBits Known =
computeKnownBits(V, DL, 0, &AC, dyn_cast<Instruction>(GEP1), DT);
bool SignKnownZero = Known.isNonNegative();
bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
bool IsZExt = DecompGEP1.VarIndices[i].ZExtBits > 0 || isa<ZExtInst>(V);
SignKnownZero |= IsZExt;
SignKnownOne &= !IsZExt;
// If the variable begins with a zero then we know it's
// positive, regardless of whether the value is signed or
// unsigned.
APInt Scale = DecompGEP1.VarIndices[i].Scale;
AllPositive =
(SignKnownZero && Scale.sge(0)) || (SignKnownOne && Scale.slt(0));
}
}
Modulo = Modulo ^ (Modulo & (Modulo - 1));
// We can compute the difference between the two addresses
// mod Modulo. Check whether that difference guarantees that the
// two locations do not alias.
APInt ModOffset = GEP1BaseOffset & (Modulo - 1);
if (V1Size != LocationSize::unknown() &&
V2Size != LocationSize::unknown() && ModOffset.uge(V2Size.getValue()) &&
(Modulo - ModOffset).uge(V1Size.getValue()))
return NoAlias;
// If we know all the variables are positive, then GEP1 >= GEP1BasePtr.
// If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
// don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
if (AllPositive && GEP1BaseOffset.sgt(0) &&
V2Size != LocationSize::unknown() &&
GEP1BaseOffset.uge(V2Size.getValue()))
return NoAlias;
if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size,
GEP1BaseOffset, &AC, DT))
return NoAlias;
}
// Statically, we can see that the base objects are the same, but the
// pointers have dynamic offsets which we can't resolve. And none of our
// little tricks above worked.
return MayAlias;
}
static AliasResult MergeAliasResults(AliasResult A, AliasResult B) {
// If the results agree, take it.
if (A == B)
return A;
// A mix of PartialAlias and MustAlias is PartialAlias.
if ((A == PartialAlias && B == MustAlias) ||
(B == PartialAlias && A == MustAlias))
return PartialAlias;
// Otherwise, we don't know anything.
return MayAlias;
}
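// Illustrative results of the merge above (hypothetical inputs, assuming the
// usual AliasResult values):
//   MergeAliasResults(NoAlias,   NoAlias)      -> NoAlias
//   MergeAliasResults(MustAlias, PartialAlias) -> PartialAlias
//   MergeAliasResults(NoAlias,   MustAlias)    -> MayAlias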
/// Provides a bunch of ad-hoc rules to disambiguate a Select instruction
/// against another.
AliasResult
BasicAAResult::aliasSelect(const SelectInst *SI, LocationSize SISize,
const AAMDNodes &SIAAInfo, const Value *V2,
LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderV2, AAQueryInfo &AAQI) {
// If the values are Selects with the same condition, we can do a more precise
// check: just check for aliases between the values on corresponding arms.
if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
if (SI->getCondition() == SI2->getCondition()) {
AliasResult Alias =
aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, SI2->getTrueValue(),
V2Size, V2AAInfo, AAQI);
if (Alias == MayAlias)
return MayAlias;
AliasResult ThisAlias =
aliasCheck(SI->getFalseValue(), SISize, SIAAInfo,
SI2->getFalseValue(), V2Size, V2AAInfo, AAQI);
return MergeAliasResults(ThisAlias, Alias);
}
// If both arms of the Select node NoAlias or MustAlias V2, then the result is
// NoAlias / MustAlias. Otherwise, the result is MayAlias.
AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(),
SISize, SIAAInfo, AAQI, UnderV2);
if (Alias == MayAlias)
return MayAlias;
AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(),
SISize, SIAAInfo, AAQI, UnderV2);
return MergeAliasResults(ThisAlias, Alias);
}
/// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against
/// another.
AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
const AAMDNodes &PNAAInfo, const Value *V2,
LocationSize V2Size,
const AAMDNodes &V2AAInfo,
const Value *UnderV2, AAQueryInfo &AAQI) {
// Track phi nodes we have visited. We use this information when we determine
// value equivalence.
VisitedPhiBBs.insert(PN->getParent());
// If the values are PHIs in the same block, we can do a more precise
// as well as efficient check: just check for aliases between the values
// on corresponding edges.
if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
if (PN2->getParent() == PN->getParent()) {
AAQueryInfo::LocPair Locs(MemoryLocation(PN, PNSize, PNAAInfo),
MemoryLocation(V2, V2Size, V2AAInfo));
if (PN > V2)
std::swap(Locs.first, Locs.second);
// Analyse the PHIs' inputs under the assumption that the PHIs are
// NoAlias.
// If the PHIs are May/MustAlias there must be (recursively) an input
// operand from outside the PHIs' cycle that is MayAlias/MustAlias or
// there must be an operation on the PHIs within the PHIs' value cycle
// that causes a MayAlias.
// Pretend the phis do not alias.
AliasResult Alias = NoAlias;
AliasResult OrigAliasResult;
{
// Limited lifetime iterator invalidated by the aliasCheck call below.
auto CacheIt = AAQI.AliasCache.find(Locs);
assert((CacheIt != AAQI.AliasCache.end()) &&
"There must exist an entry for the phi node");
OrigAliasResult = CacheIt->second;
CacheIt->second = NoAlias;
}
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
AliasResult ThisAlias =
aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo,
PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)),
V2Size, V2AAInfo, AAQI);
Alias = MergeAliasResults(ThisAlias, Alias);
if (Alias == MayAlias)
break;
}
// Reset if speculation failed.
if (Alias != NoAlias) {
auto Pair =
AAQI.AliasCache.insert(std::make_pair(Locs, OrigAliasResult));
assert(!Pair.second && "Entry must have existed");
Pair.first->second = OrigAliasResult;
}
return Alias;
}
SmallVector<Value *, 4> V1Srcs;
+ // For a recursive phi that recurses through a constant gep, we can perform
+ // aliasing calculations using the other phi operands with an unknown size to
+ // specify that an unknown number of elements after the initial value are
+ // potentially accessed.
bool isRecursive = false;
- if (PV) {
+ auto CheckForRecPhi = [&](Value *PV) {
+ if (!EnableRecPhiAnalysis)
+ return false;
+ if (GEPOperator *PVGEP = dyn_cast<GEPOperator>(PV)) {
+ // Check whether the incoming value is a GEP that advances the pointer
+ // result of this PHI node (e.g. in a loop). If this is the case, we
+ // would recurse and always get a MayAlias. Handle this case specially
+ // below. We need to ensure that the GEP is inbounds and has a constant
+ // non-negative index so that we can check for alias with the initial value
+ // and an unknown but positive size.
+ if (PVGEP->getPointerOperand() == PN && PVGEP->isInBounds() &&
+ PVGEP->getNumIndices() == 1 && isa<ConstantInt>(PVGEP->idx_begin()) &&
+ !cast<ConstantInt>(PVGEP->idx_begin())->isNegative()) {
+ isRecursive = true;
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (PV) {
// If we have PhiValues then use it to get the underlying phi values.
const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN);
// If we have more phi values than the search depth then return MayAlias
// conservatively to avoid compile time explosion. The worst possible case
// is if both sides are PHI nodes. In which case, this is O(m x n) time
// where 'm' and 'n' are the number of PHI sources.
if (PhiValueSet.size() > MaxLookupSearchDepth)
return MayAlias;
// Add the values to V1Srcs
for (Value *PV1 : PhiValueSet) {
- if (EnableRecPhiAnalysis) {
- if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) {
- // Check whether the incoming value is a GEP that advances the pointer
- // result of this PHI node (e.g. in a loop). If this is the case, we
- // would recurse and always get a MayAlias. Handle this case specially
- // below.
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 &&
- isa<ConstantInt>(PV1GEP->idx_begin())) {
- isRecursive = true;
- continue;
- }
- }
- }
+ if (CheckForRecPhi(PV1))
+ continue;
V1Srcs.push_back(PV1);
}
} else {
// If we don't have PhiInfo then just look at the operands of the phi itself
// FIXME: Remove this once we can guarantee that we have PhiInfo always
SmallPtrSet<Value *, 4> UniqueSrc;
for (Value *PV1 : PN->incoming_values()) {
if (isa<PHINode>(PV1))
// If any of the sources is itself a PHI, return MayAlias conservatively
// to avoid compile time explosion. The worst possible case is if both
// sides are PHI nodes. In which case, this is O(m x n) time where 'm'
// and 'n' are the number of PHI sources.
return MayAlias;
- if (EnableRecPhiAnalysis)
- if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) {
- // Check whether the incoming value is a GEP that advances the pointer
- // result of this PHI node (e.g. in a loop). If this is the case, we
- // would recurse and always get a MayAlias. Handle this case specially
- // below.
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 &&
- isa<ConstantInt>(PV1GEP->idx_begin())) {
- isRecursive = true;
- continue;
- }
- }
+ if (CheckForRecPhi(PV1))
+ continue;
if (UniqueSrc.insert(PV1).second)
V1Srcs.push_back(PV1);
}
}
// If V1Srcs is empty then that means that the phi has no underlying non-phi
// value. This should only be possible in blocks unreachable from the entry
// block, but return MayAlias just in case.
if (V1Srcs.empty())
return MayAlias;
// If this PHI node is recursive, set the size of the accessed memory to
// unknown to represent all the possible values the GEP could advance the
// pointer to.
if (isRecursive)
PNSize = LocationSize::unknown();
AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], PNSize,
PNAAInfo, AAQI, UnderV2);
// Early exit if the check of the first PHI source against V2 is MayAlias.
// Other results are not possible.
if (Alias == MayAlias)
return MayAlias;
// With recursive phis we cannot guarantee that MustAlias/PartialAlias will
// remain valid for all elements, so we conservatively return MayAlias.
if (isRecursive && Alias != NoAlias)
return MayAlias;
// If all sources of the PHI node NoAlias or MustAlias V2, then the result is
// NoAlias / MustAlias. Otherwise, the result is MayAlias.
for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
Value *V = V1Srcs[i];
AliasResult ThisAlias =
aliasCheck(V2, V2Size, V2AAInfo, V, PNSize, PNAAInfo, AAQI, UnderV2);
Alias = MergeAliasResults(ThisAlias, Alias);
if (Alias == MayAlias)
break;
}
return Alias;
}
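// Illustrative source pattern (hypothetical C, for exposition) that the
// recursive-phi handling above targets: 'p' is a PHI whose back-edge value is
// an inbounds GEP of itself with a constant non-negative index, so its
// accesses are modelled as an unknown-sized range starting at the initial
// value.
//
//   void clearAll(int *base, int *end) {
//     for (int *p = base; p != end; ++p)   // p = phi(base, p + 1)
//       *p = 0;
//   }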
/// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as
/// array references.
AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
AAMDNodes V1AAInfo, const Value *V2,
LocationSize V2Size, AAMDNodes V2AAInfo,
AAQueryInfo &AAQI, const Value *O1,
const Value *O2) {
// If either of the memory references is empty, it doesn't matter what the
// pointer values are.
if (V1Size.isZero() || V2Size.isZero())
return NoAlias;
// Strip off any casts if they exist.
V1 = V1->stripPointerCastsAndInvariantGroups();
V2 = V2->stripPointerCastsAndInvariantGroups();
// If V1 or V2 is undef, the result is NoAlias because we can always pick a
// value for undef that aliases nothing in the program.
if (isa<UndefValue>(V1) || isa<UndefValue>(V2))
return NoAlias;
// Are we checking for alias of the same value?
// Because we look 'through' phi nodes, we could look at "Value" pointers from
// different iterations. We must therefore make sure that this is not the
// case. The function isValueEqualInPotentialCycles ensures that this cannot
// happen by looking at the visited phi nodes and making sure they cannot
// reach the value.
if (isValueEqualInPotentialCycles(V1, V2))
return MustAlias;
if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy())
return NoAlias; // Scalars cannot alias each other
// Figure out what objects these things are pointing to if we can.
if (O1 == nullptr)
O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth);
if (O2 == nullptr)
O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth);
// Null values in the default address space don't point to any object, so they
// don't alias any other pointer.
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O1))
if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O2))
if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (O1 != O2) {
// If V1/V2 point to two different objects, we know that we have no alias.
if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
return NoAlias;
// Constant pointers can't alias with non-const isIdentifiedObject objects.
if ((isa<Constant>(O1) && isIdentifiedObject(O2) && !isa<Constant>(O2)) ||
(isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
return NoAlias;
// Function arguments can't alias with things that are known to be
// unambiguously identified at the function level.
if ((isa<Argument>(O1) && isIdentifiedFunctionLocal(O2)) ||
(isa<Argument>(O2) && isIdentifiedFunctionLocal(O1)))
return NoAlias;
// If one pointer is the result of a call/invoke or load and the other is a
// non-escaping local object within the same function, then we know the
// object couldn't escape to a point where the call could return it.
//
// Note that if the pointers are in different functions, there are a
// variety of complications. A call with a nocapture argument may still
// temporarily store the nocapture argument's value in a temporary memory
// location if that memory location doesn't escape. Or it may pass a
// nocapture value to other functions as long as they don't capture it.
if (isEscapeSource(O1) &&
isNonEscapingLocalObject(O2, &AAQI.IsCapturedCache))
return NoAlias;
if (isEscapeSource(O2) &&
isNonEscapingLocalObject(O1, &AAQI.IsCapturedCache))
return NoAlias;
}
// If the size of one access is larger than the entire object on the other
// side, then we know such behavior is undefined and can assume no alias.
bool NullIsValidLocation = NullPointerIsDefined(&F);
if ((isObjectSmallerThan(
O2, getMinimalExtentFrom(*V1, V1Size, DL, NullIsValidLocation), DL,
TLI, NullIsValidLocation)) ||
(isObjectSmallerThan(
O1, getMinimalExtentFrom(*V2, V2Size, DL, NullIsValidLocation), DL,
TLI, NullIsValidLocation)))
return NoAlias;
// Check the cache before climbing up use-def chains. This also terminates
// otherwise infinitely recursive queries.
AAQueryInfo::LocPair Locs(MemoryLocation(V1, V1Size, V1AAInfo),
MemoryLocation(V2, V2Size, V2AAInfo));
if (V1 > V2)
std::swap(Locs.first, Locs.second);
std::pair<AAQueryInfo::AliasCacheT::iterator, bool> Pair =
AAQI.AliasCache.try_emplace(Locs, MayAlias);
if (!Pair.second)
return Pair.first->second;
// FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
// GEP can't simplify, we don't even look at the PHI cases.
if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
std::swap(V1, V2);
std::swap(V1Size, V2Size);
std::swap(O1, O2);
std::swap(V1AAInfo, V2AAInfo);
}
if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
AliasResult Result =
aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2, AAQI);
if (Result != MayAlias) {
auto ItInsPair = AAQI.AliasCache.insert(std::make_pair(Locs, Result));
assert(!ItInsPair.second && "Entry must have existed");
ItInsPair.first->second = Result;
return Result;
}
}
if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
std::swap(V1, V2);
std::swap(O1, O2);
std::swap(V1Size, V2Size);
std::swap(V1AAInfo, V2AAInfo);
}
if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
AliasResult Result =
aliasPHI(PN, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2, AAQI);
if (Result != MayAlias) {
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
}
}
if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
std::swap(V1, V2);
std::swap(O1, O2);
std::swap(V1Size, V2Size);
std::swap(V1AAInfo, V2AAInfo);
}
if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
AliasResult Result =
aliasSelect(S1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2, AAQI);
if (Result != MayAlias) {
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
}
}
// If both pointers are pointing into the same object and one of them
// accesses the entire object, then the accesses must overlap in some way.
if (O1 == O2)
if (V1Size.isPrecise() && V2Size.isPrecise() &&
(isObjectSize(O1, V1Size.getValue(), DL, TLI, NullIsValidLocation) ||
isObjectSize(O2, V2Size.getValue(), DL, TLI, NullIsValidLocation))) {
Pair = AAQI.AliasCache.try_emplace(Locs, PartialAlias);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = PartialAlias;
}
// Recurse back into the best AA results we have, potentially with refined
// memory locations. We have already ensured that BasicAA has a MayAlias
// cache result for these, so any recursion back into BasicAA won't loop.
AliasResult Result = getBestAAResults().alias(Locs.first, Locs.second, AAQI);
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
}
/// Check whether two Values can be considered equivalent.
///
/// In addition to pointer equivalence of \p V1 and \p V2 this checks whether
/// they can not be part of a cycle in the value graph by looking at all
/// visited phi nodes and making sure that the phis cannot reach the value. We
/// have to do this because we are looking through phi nodes (that is, we say
/// noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB)).
bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V,
const Value *V2) {
if (V != V2)
return false;
const Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst)
return true;
if (VisitedPhiBBs.empty())
return true;
if (VisitedPhiBBs.size() > MaxNumPhiBBsValueReachabilityCheck)
return false;
// Make sure that the visited phis cannot reach the Value. This ensures that
// the Values cannot come from different iterations of a potential cycle the
// phi nodes could be involved in.
for (auto *P : VisitedPhiBBs)
if (isPotentiallyReachable(&P->front(), Inst, nullptr, DT, LI))
return false;
return true;
}
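// Example of why the reachability check above is needed (illustrative): after
// looking through a PHI such as
//   %p = phi i32* [ %base, %entry ], [ %q, %loop ]
//   %q = getelementptr inbounds i32, i32* %p, i64 1
// the same SSA value %q reached along two different query paths may stand for
// addresses from different loop iterations, so pointer equality alone is not
// enough to conclude MustAlias.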
/// Computes the symbolic difference between two de-composed GEPs.
///
/// Dest and Src are the variable indices from two decomposed GetElementPtr
/// instructions GEP1 and GEP2 which have common base pointers.
void BasicAAResult::GetIndexDifference(
SmallVectorImpl<VariableGEPIndex> &Dest,
const SmallVectorImpl<VariableGEPIndex> &Src) {
if (Src.empty())
return;
for (unsigned i = 0, e = Src.size(); i != e; ++i) {
const Value *V = Src[i].V;
unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits;
APInt Scale = Src[i].Scale;
// Find V in Dest. This is N^2, but pointer indices almost never have more
// than a few variable indexes.
for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
if (!isValueEqualInPotentialCycles(Dest[j].V, V) ||
Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits)
continue;
// If we found it, subtract off Scale V's from the entry in Dest. If it
// goes to zero, remove the entry.
if (Dest[j].Scale != Scale)
Dest[j].Scale -= Scale;
else
Dest.erase(Dest.begin() + j);
Scale = 0;
break;
}
// If we didn't consume this entry, add it to the end of the Dest list.
if (!!Scale) {
VariableGEPIndex Entry = {V, ZExtBits, SExtBits, -Scale};
Dest.push_back(Entry);
}
}
}
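// Worked example (illustrative): if Dest = {(%x, scale 4)} and
// Src = {(%x, scale 4)}, the two entries cancel and Dest becomes empty; if
// Src = {(%y, scale 2)} instead, no matching entry is found and
// (%y, scale -2) is appended to Dest.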
bool BasicAAResult::constantOffsetHeuristic(
const SmallVectorImpl<VariableGEPIndex> &VarIndices,
LocationSize MaybeV1Size, LocationSize MaybeV2Size, const APInt &BaseOffset,
AssumptionCache *AC, DominatorTree *DT) {
if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() ||
MaybeV2Size == LocationSize::unknown())
return false;
const uint64_t V1Size = MaybeV1Size.getValue();
const uint64_t V2Size = MaybeV2Size.getValue();
const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1];
if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits ||
Var0.Scale != -Var1.Scale)
return false;
unsigned Width = Var1.V->getType()->getIntegerBitWidth();
// We'll strip off the Extensions of Var0 and Var1 and do another round
// of GetLinearExpression decomposition. In the example above, if Var0
// is zext(%x + 1) we should get V1 == %x and V1Offset == 1.
APInt V0Scale(Width, 0), V0Offset(Width, 0), V1Scale(Width, 0),
V1Offset(Width, 0);
bool NSW = true, NUW = true;
unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0;
const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits,
V0SExtBits, DL, 0, AC, DT, NSW, NUW);
NSW = true;
NUW = true;
const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits,
V1SExtBits, DL, 0, AC, DT, NSW, NUW);
if (V0Scale != V1Scale || V0ZExtBits != V1ZExtBits ||
V0SExtBits != V1SExtBits || !isValueEqualInPotentialCycles(V0, V1))
return false;
// We have a hit - Var0 and Var1 only differ by a constant offset!
// If we've been sext'ed then zext'd the maximum difference between Var0 and
// Var1 is possible to calculate, but we're just interested in the absolute
// minimum difference between the two. The minimum distance may occur due to
// wrapping; consider "add i3 %i, 5": if %i == 7 then 7 + 5 mod 8 == 4, and so
// the minimum distance between %i and %i + 5 is 3.
APInt MinDiff = V0Offset - V1Offset, Wrapped = -MinDiff;
MinDiff = APIntOps::umin(MinDiff, Wrapped);
APInt MinDiffBytes =
MinDiff.zextOrTrunc(Var0.Scale.getBitWidth()) * Var0.Scale.abs();
// We can't definitely say whether GEP1 is before or after V2 due to wrapping
// arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other
// values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and
// V2Size can fit in the MinDiffBytes gap.
return MinDiffBytes.uge(V1Size + BaseOffset.abs()) &&
MinDiffBytes.uge(V2Size + BaseOffset.abs());
}
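// Worked example (illustrative numbers): with Var0 = zext(%x + 1) at scale 4
// and Var1 = zext(%x) at scale -4, the stripped expressions agree on %x and
// differ only by the constant 1, so MinDiffBytes = 1 * 4 = 4; with
// V1Size = V2Size = 4 and BaseOffset = 0 the accesses can never overlap and
// the heuristic reports NoAlias.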
//===----------------------------------------------------------------------===//
// BasicAliasAnalysis Pass
//===----------------------------------------------------------------------===//
AnalysisKey BasicAA::Key;
BasicAAResult BasicAA::run(Function &F, FunctionAnalysisManager &AM) {
return BasicAAResult(F.getParent()->getDataLayout(),
F,
AM.getResult<TargetLibraryAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F),
&AM.getResult<DominatorTreeAnalysis>(F),
AM.getCachedResult<LoopAnalysis>(F),
AM.getCachedResult<PhiValuesAnalysis>(F));
}
BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) {
initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry());
}
char BasicAAWrapperPass::ID = 0;
void BasicAAWrapperPass::anchor() {}
INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basic-aa",
"Basic Alias Analysis (stateless AA impl)", true, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PhiValuesWrapperPass)
INITIALIZE_PASS_END(BasicAAWrapperPass, "basic-aa",
"Basic Alias Analysis (stateless AA impl)", true, true)
FunctionPass *llvm::createBasicAAWrapperPass() {
return new BasicAAWrapperPass();
}
bool BasicAAWrapperPass::runOnFunction(Function &F) {
auto &ACT = getAnalysis<AssumptionCacheTracker>();
auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *PVWP = getAnalysisIfAvailable<PhiValuesWrapperPass>();
Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F,
TLIWP.getTLI(F), ACT.getAssumptionCache(F),
&DTWP.getDomTree(),
LIWP ? &LIWP->getLoopInfo() : nullptr,
PVWP ? &PVWP->getResult() : nullptr));
return false;
}
void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addUsedIfAvailable<PhiValuesWrapperPass>();
}
BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) {
return BasicAAResult(
F.getParent()->getDataLayout(), F,
P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
P.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
}
diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
index 6c5ef0255a08..204fb556d810 100644
--- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -1,444 +1,448 @@
//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass assigns local frame indices to stack slots relative to one another
// and allocates additional base registers to access them when the target
// estimates they are likely to be out of range of stack pointer and frame
// pointer relative addressing.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <tuple>
using namespace llvm;
#define DEBUG_TYPE "localstackalloc"
STATISTIC(NumAllocations, "Number of frame indices allocated into local block");
STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated");
STATISTIC(NumReplacements, "Number of frame indices references replaced");
namespace {
class FrameRef {
MachineBasicBlock::iterator MI; // Instr referencing the frame
int64_t LocalOffset; // Local offset of the frame idx referenced
int FrameIdx; // The frame index
// Order in which the reference instruction appears in the program. Used to ensure
// deterministic order when multiple instructions may reference the same
// location.
unsigned Order;
public:
FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) :
MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {}
bool operator<(const FrameRef &RHS) const {
return std::tie(LocalOffset, FrameIdx, Order) <
std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order);
}
MachineBasicBlock::iterator getMachineInstr() const { return MI; }
int64_t getLocalOffset() const { return LocalOffset; }
int getFrameIndex() const { return FrameIdx; }
};
class LocalStackSlotPass: public MachineFunctionPass {
SmallVector<int64_t, 16> LocalOffsets;
/// StackObjSet - A set of stack object indexes
using StackObjSet = SmallSetVector<int, 8>;
void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, int64_t &Offset,
bool StackGrowsDown, Align &MaxAlign);
void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
SmallSet<int, 16> &ProtectedObjs,
MachineFrameInfo &MFI, bool StackGrowsDown,
int64_t &Offset, Align &MaxAlign);
void calculateFrameObjectOffsets(MachineFunction &Fn);
bool insertFrameReferenceRegisters(MachineFunction &Fn);
public:
static char ID; // Pass identification, replacement for typeid
explicit LocalStackSlotPass() : MachineFunctionPass(ID) {
initializeLocalStackSlotPassPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // end anonymous namespace
char LocalStackSlotPass::ID = 0;
char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
INITIALIZE_PASS(LocalStackSlotPass, DEBUG_TYPE,
"Local Stack Slot Allocation", false, false)
bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
unsigned LocalObjectCount = MFI.getObjectIndexEnd();
// If the target doesn't want/need this pass, or if there are no locals
// to consider, early exit.
if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0)
return true;
// Make sure we have enough space to store the local offsets.
LocalOffsets.resize(MFI.getObjectIndexEnd());
// Lay out the local blob.
calculateFrameObjectOffsets(MF);
// Insert virtual base registers to resolve frame index references.
bool UsedBaseRegs = insertFrameReferenceRegisters(MF);
// Tell MFI whether any base registers were allocated. PEI will only
// want to use the local block allocations from this pass if there were any.
// Otherwise, PEI can do a bit better job of getting the alignment right
// without a hole at the start since it knows the alignment of the stack
// at the start of local allocation, and this pass doesn't.
MFI.setUseLocalStackAllocationBlock(UsedBaseRegs);
return true;
}
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
int64_t &Offset, bool StackGrowsDown,
Align &MaxAlign) {
// If the stack grows down, add the object size to find the lowest address.
if (StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
Align Alignment = MFI.getObjectAlign(FrameIdx);
// If the alignment of this object is greater than that of the stack, then
// increase the stack alignment to match.
MaxAlign = std::max(MaxAlign, Alignment);
// Adjust to alignment boundary.
Offset = alignTo(Offset, Alignment);
int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
<< LocalOffset << "\n");
// Keep the offset available for base register allocation
LocalOffsets[FrameIdx] = LocalOffset;
// And tell MFI about it for PEI to use later
MFI.mapLocalFrameObject(FrameIdx, LocalOffset);
if (!StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
++NumAllocations;
}
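// Worked example (illustrative, stack growing down): starting from Offset = 0,
// an 8-byte object with 16-byte alignment yields Offset = 8, aligned up to 16,
// so the object is mapped at local offset -16; a following 4-byte object with
// 4-byte alignment yields Offset = 20 and local offset -20.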
/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
/// those required to be close to the Stack Protector) to stack offsets.
void LocalStackSlotPass::AssignProtectedObjSet(
const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs,
MachineFrameInfo &MFI, bool StackGrowsDown, int64_t &Offset,
Align &MaxAlign) {
for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
E = UnassignedObjs.end(); I != E; ++I) {
int i = *I;
AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
ProtectedObjs.insert(i);
}
}
/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
/// abstract stack objects.
void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Loop over all of the stack objects, assigning sequential addresses...
MachineFrameInfo &MFI = Fn.getFrameInfo();
const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
int64_t Offset = 0;
Align MaxAlign;
// Make sure that the stack protector comes before the local variables on the
// stack.
SmallSet<int, 16> ProtectedObjs;
if (MFI.hasStackProtectorIndex()) {
int StackProtectorFI = MFI.getStackProtectorIndex();
// We need to make sure we didn't pre-allocate the stack protector when
// doing this.
// If we already have a stack protector, this will re-assign it to a slot
// that is **not** covering the protected objects.
assert(!MFI.isObjectPreAllocated(StackProtectorFI) &&
"Stack protector pre-allocated in LocalStackSlotAllocation");
StackObjSet LargeArrayObjs;
StackObjSet SmallArrayObjs;
StackObjSet AddrOfObjs;
AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, MaxAlign);
// Assign large stack objects first.
for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
if (MFI.isDeadObjectIndex(i))
continue;
if (StackProtectorFI == (int)i)
continue;
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
switch (MFI.getObjectSSPLayout(i)) {
case MachineFrameInfo::SSPLK_None:
continue;
case MachineFrameInfo::SSPLK_SmallArray:
SmallArrayObjs.insert(i);
continue;
case MachineFrameInfo::SSPLK_AddrOf:
AddrOfObjs.insert(i);
continue;
case MachineFrameInfo::SSPLK_LargeArray:
LargeArrayObjs.insert(i);
continue;
}
llvm_unreachable("Unexpected SSPLayoutKind.");
}
AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
}
// Then assign frame offsets to stack objects that are not used to spill
// callee saved registers.
for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
if (MFI.isDeadObjectIndex(i))
continue;
if (MFI.getStackProtectorIndex() == (int)i)
continue;
if (ProtectedObjs.count(i))
continue;
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
}
// Remember how big this blob of stack space is
MFI.setLocalFrameSize(Offset);
MFI.setLocalFrameMaxAlign(MaxAlign);
}
static inline bool
lookupCandidateBaseReg(unsigned BaseReg,
int64_t BaseOffset,
int64_t FrameSizeAdjust,
int64_t LocalFrameOffset,
const MachineInstr &MI,
const TargetRegisterInfo *TRI) {
// Check if the relative offset from the address the base register references
// to the target address is in range for the instruction.
int64_t Offset = FrameSizeAdjust + LocalFrameOffset - BaseOffset;
return TRI->isFrameOffsetLegal(&MI, BaseReg, Offset);
}
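// Worked example (illustrative numbers): with a downward-growing stack,
// FrameSizeAdjust = 64, LocalFrameOffset = -16 and a base register defined at
// BaseOffset = 40, the instruction would have to encode
// Offset = 64 + (-16) - 40 = 8, and isFrameOffsetLegal decides whether that
// offset is representable for this instruction.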
bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Scan the function's instructions looking for frame index references.
// For each, ask the target if it wants a virtual base register for it
// based on what we can tell it about where the local will end up in the
// stack frame. If it wants one, re-use a suitable one we've previously
// allocated, or if there isn't one that fits the bill, allocate a new one
// and ask the target to create a defining instruction for it.
bool UsedBaseReg = false;
MachineFrameInfo &MFI = Fn.getFrameInfo();
const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
// Collect all of the instructions in the block that reference
// a frame index. Also store the frame index referenced to ease later
// lookup. (For any insn that has more than one FI reference, we arbitrarily
// choose the first one).
SmallVector<FrameRef, 64> FrameReferenceInsns;
unsigned Order = 0;
for (MachineBasicBlock &BB : Fn) {
for (MachineInstr &MI : BB) {
// Debug value, stackmap and patchpoint instructions can't be out of
// range, so they don't need any updates.
if (MI.isDebugInstr() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT)
continue;
// For now, allocate the base register(s) within the basic block
// where they're used, and don't try to keep them around outside
// of that. It may be beneficial to try sharing them more broadly
// than that, but the increased register pressure makes that a
// tricky thing to balance. Investigate if re-materializing these
// becomes an issue.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
// Consider replacing all frame index operands that reference
// an object allocated in the local block.
if (MI.getOperand(i).isFI()) {
// Don't try this with values not in the local block.
if (!MFI.isObjectPreAllocated(MI.getOperand(i).getIndex()))
break;
int Idx = MI.getOperand(i).getIndex();
int64_t LocalOffset = LocalOffsets[Idx];
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
break;
FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++));
break;
}
}
}
}
// Sort the frame references by local offset.
// Use frame index as a tie-breaker in case MI's have the same offset.
llvm::sort(FrameReferenceInsns);
MachineBasicBlock *Entry = &Fn.front();
unsigned BaseReg = 0;
int64_t BaseOffset = 0;
// Loop through the frame references and allocate for them as necessary.
for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
FrameRef &FR = FrameReferenceInsns[ref];
MachineInstr &MI = *FR.getMachineInstr();
int64_t LocalOffset = FR.getLocalOffset();
int FrameIdx = FR.getFrameIndex();
assert(MFI.isObjectPreAllocated(FrameIdx) &&
"Only pre-allocated locals expected!");
// We need to keep the references to the stack protector slot through frame
// index operands so that it gets resolved by PEI rather than this pass.
// This avoids accesses to the stack protector through virtual base
// registers, and forces PEI to address it using fp/sp/bp.
if (MFI.hasStackProtectorIndex() &&
FrameIdx == MFI.getStackProtectorIndex())
continue;
LLVM_DEBUG(dbgs() << "Considering: " << MI);
unsigned idx = 0;
for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
if (!MI.getOperand(idx).isFI())
continue;
if (FrameIdx == MI.getOperand(idx).getIndex())
break;
}
assert(idx < MI.getNumOperands() && "Cannot find FI operand");
int64_t Offset = 0;
int64_t FrameSizeAdjust = StackGrowsDown ? MFI.getLocalFrameSize() : 0;
LLVM_DEBUG(dbgs() << " Replacing FI in: " << MI);
// If we have a suitable base register available, use it; otherwise
// create a new one. Note that any offset encoded in the
// instruction itself will be taken into account by the target,
// so we don't have to adjust for it here when reusing a base
// register.
if (UsedBaseReg &&
lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
LocalOffset, MI, TRI)) {
LLVM_DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
// We found a register to reuse.
Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
} else {
// No previously defined register was in range, so create a new one.
int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, idx);
int64_t PrevBaseOffset = BaseOffset;
BaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
// We'd like to avoid creating single-use virtual base registers.
// Because the FrameRefs are in sorted order, and we've already
// processed all FrameRefs before this one, just check whether or not
// the next FrameRef will be able to reuse this new register. If not,
// then don't bother creating it.
if (ref + 1 >= e ||
!lookupCandidateBaseReg(
BaseReg, BaseOffset, FrameSizeAdjust,
FrameReferenceInsns[ref + 1].getLocalOffset(),
*FrameReferenceInsns[ref + 1].getMachineInstr(), TRI)) {
BaseOffset = PrevBaseOffset;
continue;
}
const MachineFunction *MF = MI.getMF();
const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
LLVM_DEBUG(dbgs() << " Materializing base register " << BaseReg
<< " at frame local offset "
<< LocalOffset + InstrOffset << "\n");
// Tell the target to insert the instruction to initialize
// the base register.
// MachineBasicBlock::iterator InsertionPt = Entry->begin();
TRI->materializeFrameBaseRegister(Entry, BaseReg, FrameIdx,
InstrOffset);
// The base register already includes any offset specified
// by the instruction, so account for that so it doesn't get
// applied twice.
Offset = -InstrOffset;
++NumBaseRegisters;
UsedBaseReg = true;
}
assert(BaseReg != 0 && "Unable to allocate virtual base register!");
// Modify the instruction to use the new base register rather
// than the frame index operand.
TRI->resolveFrameIndex(MI, BaseReg, Offset);
LLVM_DEBUG(dbgs() << "Resolved: " << MI);
++NumReplacements;
}
return UsedBaseReg;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f14b3dba4f31..ec384d2a7c56 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1,22146 +1,22158 @@
//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
// both before and after the DAG is legalized.
//
// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
// primarily intended to handle simplification opportunities that are implicit
// in the LLVM IR and exposed by the various codegen lowering phases.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "dagcombine"
STATISTIC(NodesCombined , "Number of dag nodes combined");
STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of load sliced");
STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");
static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
cl::desc("Enable DAG combiner's use of IR alias analysis"));
static cl::opt<bool>
UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
cl::desc("Enable DAG combiner's use of TBAA"));
#ifndef NDEBUG
static cl::opt<std::string>
CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
cl::desc("Only use DAG-combiner alias analysis in this"
" function"));
#endif
/// Hidden option to stress test load slicing, i.e., when this option
/// is enabled, load slicing bypasses most of its profitability guards.
static cl::opt<bool>
StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
cl::desc("Bypass the profitability model of load slicing"),
cl::init(false));
static cl::opt<bool>
MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
cl::desc("DAG combiner may split indexing from loads"));
static cl::opt<bool>
EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
cl::desc("DAG combiner enable merging multiple stores "
"into a wider store"));
static cl::opt<unsigned> TokenFactorInlineLimit(
"combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
cl::desc("Limit the number of operands to inline for Token Factors"));
static cl::opt<unsigned> StoreMergeDependenceLimit(
"combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
cl::desc("Limit the number of times for the same StoreNode and RootNode "
"to bail out in store merging dependence check"));
static cl::opt<bool> EnableReduceLoadOpStoreWidth(
"combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
cl::desc("DAG cominber enable reducing the width of load/op/store "
"sequence"));
static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
"combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
cl::desc("DAG cominber enable load/<replace bytes>/store with "
"a narrower store"));
namespace {
class DAGCombiner {
SelectionDAG &DAG;
const TargetLowering &TLI;
const SelectionDAGTargetInfo *STI;
CombineLevel Level;
CodeGenOpt::Level OptLevel;
bool LegalDAG = false;
bool LegalOperations = false;
bool LegalTypes = false;
bool ForCodeSize;
bool DisableGenericCombines;
/// Worklist of all of the nodes that need to be simplified.
///
/// This must behave as a stack -- new nodes to process are pushed onto the
/// back and when processing we pop off of the back.
///
/// The worklist will not contain duplicates but may contain null entries
/// due to nodes being deleted from the underlying DAG.
SmallVector<SDNode *, 64> Worklist;
/// Mapping from an SDNode to its position on the worklist.
///
/// This is used to find and remove nodes from the worklist (by nulling
/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
/// This records all nodes attempted to be added to the worklist since we
/// considered a new worklist entry. As we do not add duplicate nodes to the
/// worklist, this is different from the tail of the worklist.
SmallSetVector<SDNode *, 32> PruningList;
/// Set of nodes which have been combined (at least once).
///
/// This is used to allow us to reliably add any operands of a DAG node
/// which have not yet been combined to the worklist.
SmallPtrSet<SDNode *, 32> CombinedNodes;
/// Map from candidate StoreNode to the pair of RootNode and count.
/// The count is used to track how many times we have seen the StoreNode
/// with the same RootNode bail out in dependence check. If we have seen
/// the bail out for the same pair many times over a limit, we won't
/// consider the StoreNode with the same RootNode as store merging
/// candidate again.
DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;
// AA - Used for DAG load/store alias analysis.
AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
void AddUsersToWorklist(SDNode *N) {
for (SDNode *Node : N->uses())
AddToWorklist(Node);
}
/// Convenient shorthand to add a node and all of its users to the worklist.
void AddToWorklistWithUsers(SDNode *N) {
AddUsersToWorklist(N);
AddToWorklist(N);
}
// Prune potentially dangling nodes. This is called after
// any visit to a node, but should also be called during a visit after any
// failed combine which may have created a DAG node.
void clearAddedDanglingWorklistEntries() {
// Check any nodes added to the worklist to see if they are prunable.
while (!PruningList.empty()) {
auto *N = PruningList.pop_back_val();
if (N->use_empty())
recursivelyDeleteUnusedNodes(N);
}
}
SDNode *getNextWorklistEntry() {
// Before we do any work, remove nodes that are not in use.
clearAddedDanglingWorklistEntries();
SDNode *N = nullptr;
// The Worklist holds the SDNodes in order, but it may contain null
// entries.
while (!N && !Worklist.empty()) {
N = Worklist.pop_back_val();
}
if (N) {
bool GoodWorklistEntry = WorklistMap.erase(N);
(void)GoodWorklistEntry;
assert(GoodWorklistEntry &&
"Found a worklist entry without a corresponding map entry!");
}
return N;
}
/// Call the node-specific routine that folds each particular type of node.
SDValue visit(SDNode *N);
public:
DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()),
STI(D.getSubtarget().getSelectionDAGInfo()),
Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) {
ForCodeSize = DAG.shouldOptForSize();
DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
MaximumLegalStoreInBits = 0;
// We use the minimum store size here, since that's all we can guarantee
// for the scalable vector types.
for (MVT VT : MVT::all_valuetypes())
if (EVT(VT).isSimple() && VT != MVT::Other &&
TLI.isTypeLegal(EVT(VT)) &&
VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits)
MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize();
}
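// For example (illustrative), on a target whose widest legal simple type is
// v4i32 the loop above leaves MaximumLegalStoreInBits = 128; for scalable
// vector types only the known minimum size is counted.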
void ConsiderForPruning(SDNode *N) {
// Mark this for potential pruning.
PruningList.insert(N);
}
/// Add to the worklist making sure its instance is at the back (next to be
/// processed).
void AddToWorklist(SDNode *N) {
assert(N->getOpcode() != ISD::DELETED_NODE &&
"Deleted Node added to Worklist");
// Skip handle nodes as they can't usefully be combined and confuse the
// zero-use deletion strategy.
if (N->getOpcode() == ISD::HANDLENODE)
return;
ConsiderForPruning(N);
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
Worklist.push_back(N);
}
/// Remove all instances of N from the worklist.
void removeFromWorklist(SDNode *N) {
CombinedNodes.erase(N);
PruningList.remove(N);
StoreRootCountMap.erase(N);
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
return; // Not in the worklist.
// Null out the entry rather than erasing it to avoid a linear operation.
Worklist[It->second] = nullptr;
WorklistMap.erase(It);
}
void deleteAndRecombine(SDNode *N);
bool recursivelyDeleteUnusedNodes(SDNode *N);
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo = true);
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
return CombineTo(N, &Res, 1, AddTo);
}
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
bool AddTo = true) {
SDValue To[] = { Res0, Res1 };
return CombineTo(N, To, 2, AddTo);
}
void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
private:
unsigned MaximumLegalStoreInBits;
/// Check the specified integer node value to see if it can be simplified or
/// if things it uses can be simplified by bit propagation.
/// If so, return true.
bool SimplifyDemandedBits(SDValue Op) {
unsigned BitWidth = Op.getScalarValueSizeInBits();
APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
return SimplifyDemandedBits(Op, DemandedBits);
}
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
return false;
// Revisit the node.
AddToWorklist(Op.getNode());
CommitTargetLoweringOpt(TLO);
return true;
}
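// Illustrative use of the two helpers above: when combining a node such as
// (and X, 0xFF), a visit routine may call SimplifyDemandedBits on X with
// DemandedBits = 0xFF; if the target can replace X with something cheaper
// that agrees on those low 8 bits, the replacement recorded in TLO is
// committed here and the rewritten node is queued for another visit.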
/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the
/// elements. If so, return true.
bool SimplifyDemandedVectorElts(SDValue Op) {
// TODO: For now just pretend it cannot be simplified.
if (Op.getValueType().isScalableVector())
return false;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
return SimplifyDemandedVectorElts(Op, DemandedElts);
}
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts,
bool AssumeSingleUse = false);
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
bool AssumeSingleUse = false);
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
SDValue SplitIndexingFromLoad(LoadSDNode *LD);
bool SliceUpLoad(SDNode *N);
// Scalars have size 0 to distinguish from singleton vectors.
SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
/// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
/// load.
///
/// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
/// \param InVecVT type of the input vector to EVE with bitcasts resolved.
/// \param EltNo index of the vector element to load.
/// \param OriginalLoad load that EVE came from to be replaced.
/// \returns EVE on success, SDValue() on failure.
SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue EltNo,
LoadSDNode *OriginalLoad);
void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
SDValue PromoteIntBinOp(SDValue Op);
SDValue PromoteIntShiftOp(SDValue Op);
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
/// Call the node-specific routine that knows how to fold each
/// particular type of node. If that doesn't do anything, try the
/// target-specific DAG combines.
SDValue combine(SDNode *N);
// Visitation implementation - Implement dag node combining for different
// node types. The semantics are as follows:
// Return Value:
// SDValue.getNode() == 0 - No change was made
// SDValue.getNode() == N - N was replaced, is dead and has been handled.
// otherwise - N should be replaced by the returned Operand.
//
SDValue visitTokenFactor(SDNode *N);
SDValue visitMERGE_VALUES(SDNode *N);
SDValue visitADD(SDNode *N);
SDValue visitADDLike(SDNode *N);
SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
SDValue visitSUB(SDNode *N);
SDValue visitADDSAT(SDNode *N);
SDValue visitSUBSAT(SDNode *N);
SDValue visitADDC(SDNode *N);
SDValue visitADDO(SDNode *N);
SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitSUBC(SDNode *N);
SDValue visitSUBO(SDNode *N);
SDValue visitADDE(SDNode *N);
SDValue visitADDCARRY(SDNode *N);
SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
SDValue visitSUBE(SDNode *N);
SDValue visitSUBCARRY(SDNode *N);
SDValue visitMUL(SDNode *N);
SDValue visitMULFIX(SDNode *N);
SDValue useDivRem(SDNode *N);
SDValue visitSDIV(SDNode *N);
SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitUDIV(SDNode *N);
SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitREM(SDNode *N);
SDValue visitMULHU(SDNode *N);
SDValue visitMULHS(SDNode *N);
SDValue visitSMUL_LOHI(SDNode *N);
SDValue visitUMUL_LOHI(SDNode *N);
SDValue visitMULO(SDNode *N);
SDValue visitIMINMAX(SDNode *N);
SDValue visitAND(SDNode *N);
SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitOR(SDNode *N);
SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitXOR(SDNode *N);
SDValue SimplifyVBinOp(SDNode *N);
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitFunnelShift(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
SDValue visitBITREVERSE(SDNode *N);
SDValue visitCTLZ(SDNode *N);
SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
SDValue visitCTTZ(SDNode *N);
SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
SDValue visitCTPOP(SDNode *N);
SDValue visitSELECT(SDNode *N);
SDValue visitVSELECT(SDNode *N);
SDValue visitSELECT_CC(SDNode *N);
SDValue visitSETCC(SDNode *N);
SDValue visitSETCCCARRY(SDNode *N);
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
SDValue visitAssertExt(SDNode *N);
SDValue visitAssertAlign(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitTRUNCATE(SDNode *N);
SDValue visitBITCAST(SDNode *N);
SDValue visitFREEZE(SDNode *N);
SDValue visitBUILD_PAIR(SDNode *N);
SDValue visitFADD(SDNode *N);
SDValue visitFSUB(SDNode *N);
SDValue visitFMUL(SDNode *N);
SDValue visitFMA(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
SDValue visitFCOPYSIGN(SDNode *N);
SDValue visitFPOW(SDNode *N);
SDValue visitSINT_TO_FP(SDNode *N);
SDValue visitUINT_TO_FP(SDNode *N);
SDValue visitFP_TO_SINT(SDNode *N);
SDValue visitFP_TO_UINT(SDNode *N);
SDValue visitFP_ROUND(SDNode *N);
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
SDValue visitFABS(SDNode *N);
SDValue visitFCEIL(SDNode *N);
SDValue visitFTRUNC(SDNode *N);
SDValue visitFFLOOR(SDNode *N);
SDValue visitFMINNUM(SDNode *N);
SDValue visitFMAXNUM(SDNode *N);
SDValue visitFMINIMUM(SDNode *N);
SDValue visitFMAXIMUM(SDNode *N);
SDValue visitBRCOND(SDNode *N);
SDValue visitBR_CC(SDNode *N);
SDValue visitLOAD(SDNode *N);
SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
SDValue visitSTORE(SDNode *N);
SDValue visitLIFETIME_END(SDNode *N);
SDValue visitINSERT_VECTOR_ELT(SDNode *N);
SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
SDValue visitBUILD_VECTOR(SDNode *N);
SDValue visitCONCAT_VECTORS(SDNode *N);
SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
SDValue visitVECTOR_SHUFFLE(SDNode *N);
SDValue visitSCALAR_TO_VECTOR(SDNode *N);
SDValue visitINSERT_SUBVECTOR(SDNode *N);
SDValue visitMLOAD(SDNode *N);
SDValue visitMSTORE(SDNode *N);
SDValue visitMGATHER(SDNode *N);
SDValue visitMSCATTER(SDNode *N);
SDValue visitFP_TO_FP16(SDNode *N);
SDValue visitFP16_TO_FP(SDNode *N);
SDValue visitVECREDUCE(SDNode *N);
SDValue visitFADDForFMACombine(SDNode *N);
SDValue visitFSUBForFMACombine(SDNode *N);
SDValue visitFMULForFMADistributiveCombine(SDNode *N);
SDValue XformToShuffleWithZero(SDNode *N);
bool reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL, SDValue N0,
SDValue N1);
SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1);
SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags);
SDValue visitShiftByConstant(SDNode *N);
SDValue foldSelectOfConstants(SDNode *N);
SDValue foldVSelectOfConstants(SDNode *N);
SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC,
bool NotExtCompare = false);
SDValue convertSelectOfFPConstantsToLoadOffset(
const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC);
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
SDValue unfoldMaskedMerge(SDNode *N);
SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, bool foldBooleans);
SDValue rebuildSetCC(SDValue N);
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC, bool MatchStrict = false) const;
bool isOneUseSetCC(SDValue N) const;
SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
SDValue CombineZExtLogicopShiftLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue convertBuildVecZextToZext(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecTruncToBitCast(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
SDValue VecIn2, unsigned LeftIdx,
bool DidSplitVec);
SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallVectorImpl<SDValue> &Aliases);
/// Return true if there is any possibility that the two addresses overlap.
bool isAlias(SDNode *Op0, SDNode *Op1) const;
/// Walk up chain skipping non-aliasing memory nodes, looking for a better
/// chain (aliasing node).
SDValue FindBetterChain(SDNode *N, SDValue Chain);
/// Try to replace a store and any possibly adjacent stores on
/// consecutive chains with better chains. Return true only if St is
/// replaced.
///
/// Notice that other chains may still be replaced even if the function
/// returns false.
bool findBetterNeighborChains(StoreSDNode *St);
// Helper for findBetterNeighborChains. Walk up the store chain, adding any
// chained stores that do not overlap and can be parallelized.
bool parallelizeChainedStores(StoreSDNode *St);
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
// Ptr to the mem node.
LSBaseSDNode *MemNode;
// Offset from the base ptr.
int64_t OffsetFromBase;
MemOpLink(LSBaseSDNode *N, int64_t Offset)
: MemNode(N), OffsetFromBase(Offset) {}
};
// Classify the origin of a stored value.
enum class StoreSource { Unknown, Constant, Extract, Load };
StoreSource getStoreSource(SDValue StoreVal) {
if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal))
return StoreSource::Constant;
if (StoreVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
StoreVal.getOpcode() == ISD::EXTRACT_SUBVECTOR)
return StoreSource::Extract;
if (isa<LoadSDNode>(StoreVal))
return StoreSource::Load;
return StoreSource::Unknown;
}
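// Illustrative classification by getStoreSource above:
//   store (i32 7), ptr                   --> StoreSource::Constant
//   store (extract_vector_elt v, i), ptr --> StoreSource::Extract
//   store (load q), ptr                  --> StoreSource::Load
//   anything else                        --> StoreSource::Unknown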
/// This is a helper function for visitMUL to check the profitability
/// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
/// MulNode is the original multiply, AddNode is (add x, c1),
/// and ConstNode is c2.
bool isMulAddWithConstProfitable(SDNode *MulNode,
SDValue &AddNode,
SDValue &ConstNode);
/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
/// the type of the loaded value to be extended.
bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT);
/// Helper function to calculate whether the given Load/Store can have its
/// width reduced to ExtVT.
bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
EVT &MemVT, unsigned ShAmt = 0);
/// Used by BackwardsPropagateMask to find suitable loads.
bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
SmallPtrSetImpl<SDNode*> &NodesWithConsts,
ConstantSDNode *Mask, SDNode *&NodeToMask);
/// Attempt to propagate a given AND node back to load leaves so that they
/// can be combined into narrow loads.
bool BackwardsPropagateMask(SDNode *N);
/// Helper function for mergeConsecutiveStores which merges the component
/// store chains.
SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores);
/// This is a helper function for mergeConsecutiveStores. When the source
/// elements of the consecutive stores are all constants or all extracted
/// vector elements, try to merge them into one larger store introducing
/// bitcasts if necessary. \return True if a merged store was created.
bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector,
bool UseTrunc);
/// This is a helper function for mergeConsecutiveStores. Stores that
/// potentially may be merged with St are placed in StoreNodes. RootNode is
/// a chain predecessor to all store candidates.
void getStoreMergeCandidates(StoreSDNode *St,
SmallVectorImpl<MemOpLink> &StoreNodes,
SDNode *&Root);
/// Helper function for mergeConsecutiveStores. Checks if candidate stores
/// have indirect dependency through their operands. RootNode is the
/// predecessor to all stores calculated by getStoreMergeCandidates and is
/// used to prune the dependency check. \return True if safe to merge.
bool checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
SDNode *RootNode);
/// This is a helper function for mergeConsecutiveStores. Given a list of
/// store candidates, find the first N that are consecutive in memory.
/// Returns 0 if there are not at least 2 consecutive stores to try merging.
unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
int64_t ElementSizeBytes) const;
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of constant values.
bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores,
EVT MemVT, SDNode *Root, bool AllowVectors);
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of extracted vector elements.
/// When extracting multiple vector elements, try to store them in one
/// vector store rather than a sequence of scalar stores.
bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *Root);
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of loaded values.
bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *Root, bool AllowVectors,
bool IsNonTemporalStore, bool IsNonTemporalLoad);
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return true if stores were merged.
bool mergeConsecutiveStores(StoreSDNode *St);
/// Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
///
/// \p N needs to be a truncation and its first operand an AND. Other
/// requirements are checked by the function (e.g. that the trunc is
/// single-use) and, if they are not met, an empty SDValue is returned.
SDValue distributeTruncateThroughAnd(SDNode *N);
/// Helper function to determine whether the target supports the operation
/// given by \p Opcode for type \p VT, that is, whether the operation
/// is legal or custom before legalizing operations, and whether it is
/// legal (but not custom) after legalization.
bool hasOperation(unsigned Opcode, EVT VT) {
if (LegalOperations)
return TLI.isOperationLegal(Opcode, VT);
return TLI.isOperationLegalOrCustom(Opcode, VT);
}
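// Illustrative use of hasOperation above: a combine that wants to introduce,
// say, an ISD::ABS node can guard the transform with hasOperation(ISD::ABS,
// VT), so that after operation legalization only nodes the target can select
// directly (Legal, not merely Custom) are created.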
public:
/// Runs the dag combiner on all nodes in the work list
void Run(CombineLevel AtLevel);
SelectionDAG &getDAG() const { return DAG; }
/// Returns a type large enough to hold any valid shift amount - before type
/// legalization these can be huge.
EVT getShiftAmountTy(EVT LHSTy) {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
}
/// This method returns true if we are running before type legalization or
/// if the specified VT is legal.
bool isTypeLegal(const EVT &VT) {
if (!LegalTypes) return true;
return TLI.isTypeLegal(VT);
}
/// Convenience wrapper around TargetLowering::getSetCCResultType
EVT getSetCCResultType(EVT VT) const {
return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
SDValue OrigLoad, SDValue ExtLoad,
ISD::NodeType ExtType);
};
/// This class is a DAGUpdateListener that removes any deleted
/// nodes from the worklist.
class WorklistRemover : public SelectionDAG::DAGUpdateListener {
DAGCombiner &DC;
public:
explicit WorklistRemover(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
void NodeDeleted(SDNode *N, SDNode *E) override {
DC.removeFromWorklist(N);
}
};
class WorklistInserter : public SelectionDAG::DAGUpdateListener {
DAGCombiner &DC;
public:
explicit WorklistInserter(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
// FIXME: Ideally we could add N to the worklist, but this causes exponential
// compile time costs in large DAGs, e.g. Halide.
void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};
} // end anonymous namespace
//===----------------------------------------------------------------------===//
// TargetLowering::DAGCombinerInfo implementation
//===----------------------------------------------------------------------===//
void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
((DAGCombiner*)DC)->AddToWorklist(N);
}
SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
}
SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
}
SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
}
bool TargetLowering::DAGCombinerInfo::
recursivelyDeleteUnusedNodes(SDNode *N) {
return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
}
void TargetLowering::DAGCombinerInfo::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
}
//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//
void DAGCombiner::deleteAndRecombine(SDNode *N) {
removeFromWorklist(N);
// If the operands of this node are only used by the node, they will now be
// dead. Make sure to re-visit them and recursively delete dead nodes.
for (const SDValue &Op : N->ops())
// For an operand generating multiple values, one of the values may
// become dead allowing further simplification (e.g. split index
// arithmetic from an indexed load).
if (Op->hasOneUse() || Op->getNumValues() > 1)
AddToWorklist(Op.getNode());
DAG.DeleteNode(N);
}
// APInts must be the same size for most operations; this helper
// function zero-extends the shorter of the pair so that they match.
// We provide an Offset so that we can create bit widths that won't overflow.
static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth());
LHS = LHS.zextOrSelf(Bits);
RHS = RHS.zextOrSelf(Bits);
}
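// Illustrative example: with LHS = APInt(8, 0x80) and RHS = APInt(16, 4),
// both values are widened to 16 bits (or 16 + Offset bits if an Offset is
// supplied), so that subsequent mixed arithmetic on the pair is well defined.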
// Return true if this node is a setcc, or is a select_cc
// that selects between the target values used for true and false, making it
// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
// the appropriate nodes based on the type of node we are checking. This
// simplifies life a bit for the callers.
bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC, bool MatchStrict) const {
if (N.getOpcode() == ISD::SETCC) {
LHS = N.getOperand(0);
RHS = N.getOperand(1);
CC = N.getOperand(2);
return true;
}
if (MatchStrict &&
(N.getOpcode() == ISD::STRICT_FSETCC ||
N.getOpcode() == ISD::STRICT_FSETCCS)) {
LHS = N.getOperand(1);
RHS = N.getOperand(2);
CC = N.getOperand(3);
return true;
}
if (N.getOpcode() != ISD::SELECT_CC ||
!TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
!TLI.isConstFalseVal(N.getOperand(3).getNode()))
return false;
if (TLI.getBooleanContents(N.getValueType()) ==
TargetLowering::UndefinedBooleanContent)
return false;
LHS = N.getOperand(0);
RHS = N.getOperand(1);
CC = N.getOperand(4);
return true;
}
/// Return true if this is a SetCC-equivalent operation with only one use.
/// If this is true, it allows the users to invert the operation for free when
/// it is profitable to do so.
bool DAGCombiner::isOneUseSetCC(SDValue N) const {
SDValue N0, N1, N2;
if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
return true;
return false;
}
// Returns the SDNode if it is a constant float BuildVector
// or constant float.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
if (isa<ConstantFPSDNode>(N))
return N.getNode();
if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
return N.getNode();
return nullptr;
}
// Determines if it is a constant integer or a build vector of constant
// integers (and undefs).
// Do not permit build vector implicit truncation.
static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
return !(Const->isOpaque() && NoOpaques);
if (N.getOpcode() != ISD::BUILD_VECTOR)
return false;
unsigned BitWidth = N.getScalarValueSizeInBits();
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
(Const->isOpaque() && NoOpaques))
return false;
}
return true;
}
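// Illustrative examples for the helper above: (BUILD_VECTOR i32 0, undef,
// i32 3) is accepted, while a build vector whose constant operands are wider
// than the element type (implicit truncation) fails the bit-width check.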
// Determines if a BUILD_VECTOR is composed of all constants, possibly mixed
// with undefs.
static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
if (V.getOpcode() != ISD::BUILD_VECTOR)
return false;
return isConstantOrConstantVector(V, NoOpaques) ||
ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
}
// Determine if this is an indexed load whose index can be split off, i.e.
// the index is not an opaque target constant.
static bool canSplitIdx(LoadSDNode *LD) {
return MaySplitLoadIndex &&
(LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
!cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
}
bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL,
SDValue N0,
SDValue N1) {
// Currently this only tries to ensure we don't undo the GEP splits done by
// CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
// we check if the following transformation would be problematic:
// (load/store (add, (add, x, offset1), offset2)) ->
// (load/store (add, x, offset1+offset2)).
if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
return false;
if (N0.hasOneUse())
return false;
auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N1);
if (!C1 || !C2)
return false;
const APInt &C1APIntVal = C1->getAPIntValue();
const APInt &C2APIntVal = C2->getAPIntValue();
if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
return false;
const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
if (CombinedValueIntVal.getBitWidth() > 64)
return false;
const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
for (SDNode *Node : N0->uses()) {
auto LoadStore = dyn_cast<MemSDNode>(Node);
if (LoadStore) {
// Is x[offset2] already not a legal addressing mode? If so then
// reassociating the constants breaks nothing (we test offset2 because
// that's the one we hope to fold into the load or store).
TargetLoweringBase::AddrMode AM;
AM.HasBaseReg = true;
AM.BaseOffs = C2APIntVal.getSExtValue();
EVT VT = LoadStore->getMemoryVT();
unsigned AS = LoadStore->getAddressSpace();
Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
continue;
// Would x[offset1+offset2] still be a legal addressing mode?
AM.BaseOffs = CombinedValue;
if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
return true;
}
}
return false;
}
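// Illustrative scenario for the check above: if CodeGenPrepare split a large
// offset as (load (add (add x, 100000), 16)) so that the +16 part can fold
// into the addressing mode, re-folding the constants into (add x, 100016)
// could push the offset out of the legal addressing-mode range and force an
// extra add next to every memory access; returning true blocks that.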
// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N0, SDValue N1) {
EVT VT = N0.getValueType();
if (N0.getOpcode() != Opc)
return SDValue();
if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode =
DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))
return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
return SDValue();
}
if (N0.hasOneUse()) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
if (!OpNode.getNode())
return SDValue();
return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
}
}
return SDValue();
}
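// Illustrative reassociations performed above:
//   (add (add x, 1), 2) --> (add x, 3)          ; both constants folded
//   (add (add x, 1), y) --> (add (add x, y), 1) ; inner add has one use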
// Try to reassociate commutative binops.
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags) {
assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
// Floating-point reassociation is not allowed without loose FP math.
if (N0.getValueType().isFloatingPoint() ||
N1.getValueType().isFloatingPoint())
if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
return SDValue();
if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
return Combined;
if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
return Combined;
return SDValue();
}
SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo) {
assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
To[0].getNode()->dump(&DAG);
dbgs() << " and " << NumTo - 1 << " other values\n");
for (unsigned i = 0, e = NumTo; i != e; ++i)
assert((!To[i].getNode() ||
N->getValueType(i) == To[i].getValueType()) &&
"Cannot combine value to value of different type!");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesWith(N, To);
if (AddTo) {
// Push the new nodes and any users onto the worklist
for (unsigned i = 0, e = NumTo; i != e; ++i) {
if (To[i].getNode()) {
AddToWorklist(To[i].getNode());
AddUsersToWorklist(To[i].getNode());
}
}
}
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node.
if (N->use_empty())
deleteAndRecombine(N);
return SDValue(N, 0);
}
void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
// Replace the old value with the new one.
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
dbgs() << '\n');
// Replace all uses. If any nodes become isomorphic to other nodes and
// are deleted, make sure to remove them from our worklist.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
// Push the new node and any (possibly new) users onto the worklist.
AddToWorklistWithUsers(TLO.New.getNode());
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node.
if (TLO.Old.getNode()->use_empty())
deleteAndRecombine(TLO.Old.getNode());
}
/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
AssumeSingleUse))
return false;
// Revisit the node.
AddToWorklist(Op.getNode());
CommitTargetLoweringOpt(TLO);
return true;
}
/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the elements.
/// If so, return true.
bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
const APInt &DemandedElts,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
APInt KnownUndef, KnownZero;
if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
TLO, 0, AssumeSingleUse))
return false;
// Revisit the node.
AddToWorklist(Op.getNode());
CommitTargetLoweringOpt(TLO);
return true;
}
void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
SDLoc DL(Load);
EVT VT = Load->getValueType(0);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
Trunc.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
deleteAndRecombine(Load);
AddToWorklist(Trunc.getNode());
}
SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
Replace = false;
SDLoc DL(Op);
if (ISD::isUNINDEXEDLoad(Op.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(Op);
EVT MemVT = LD->getMemoryVT();
ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
: LD->getExtensionType();
Replace = true;
return DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
MemVT, LD->getMemOperand());
}
unsigned Opc = Op.getOpcode();
switch (Opc) {
default: break;
case ISD::AssertSext:
if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
break;
case ISD::AssertZext:
if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
break;
case ISD::Constant: {
unsigned ExtOpc =
Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, DL, PVT, Op);
}
}
if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
return SDValue();
return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
}
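// Illustrative behaviour of the Constant case above: a byte-sized constant
// such as (i16 7) is sign-extended to the promoted type, while an oddly
// sized one such as (i1 1) is zero-extended, matching the ExtOpc selection.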
SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
return SDValue();
EVT OldVT = Op.getValueType();
SDLoc DL(Op);
bool Replace = false;
SDValue NewOp = PromoteOperand(Op, PVT, Replace);
if (!NewOp.getNode())
return SDValue();
AddToWorklist(NewOp.getNode());
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
DAG.getValueType(OldVT));
}
SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
EVT OldVT = Op.getValueType();
SDLoc DL(Op);
bool Replace = false;
SDValue NewOp = PromoteOperand(Op, PVT, Replace);
if (!NewOp.getNode())
return SDValue();
AddToWorklist(NewOp.getNode());
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
return DAG.getZeroExtendInReg(NewOp, DL, OldVT);
}
/// Promote the specified integer binary operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
EVT PVT = VT;
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
bool Replace0 = false;
SDValue N0 = Op.getOperand(0);
SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
bool Replace1 = false;
SDValue N1 = Op.getOperand(1);
SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
SDLoc DL(Op);
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
// We are always replacing N0/N1's use in N and only need additional
// replacements if there are additional uses.
// Note: We are checking uses of the *nodes* (SDNode) rather than values
// (SDValue) here because the node may reference multiple values
// (for example, the chain value of a load node).
Replace0 &= !N0->hasOneUse();
Replace1 &= (N0 != N1) && !N1->hasOneUse();
// Combine Op here so it is preserved past replacements.
CombineTo(Op.getNode(), RV);
// If the operands have a use ordering, make sure we deal with the
// predecessor first.
if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
std::swap(N0, N1);
std::swap(NN0, NN1);
}
if (Replace0) {
AddToWorklist(NN0.getNode());
ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
}
if (Replace1) {
AddToWorklist(NN1.getNode());
ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
}
return Op;
}
return SDValue();
}
/// Promote the specified integer shift operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
EVT PVT = VT;
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
bool Replace = false;
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
if (Opc == ISD::SRA)
N0 = SExtPromoteOperand(N0, PVT);
else if (Opc == ISD::SRL)
N0 = ZExtPromoteOperand(N0, PVT);
else
N0 = PromoteOperand(N0, PVT, Replace);
if (!N0.getNode())
return SDValue();
SDLoc DL(Op);
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
// Deal with Op being deleted.
if (Op && Op.getOpcode() != ISD::DELETED_NODE)
return RV;
}
return SDValue();
}
SDValue DAGCombiner::PromoteExtend(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
EVT PVT = VT;
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
// fold (aext (aext x)) -> (aext x)
// fold (aext (zext x)) -> (zext x)
// fold (aext (sext x)) -> (sext x)
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
}
return SDValue();
}
bool DAGCombiner::PromoteLoad(SDValue Op) {
if (!LegalOperations)
return false;
if (!ISD::isUNINDEXEDLoad(Op.getNode()))
return false;
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return false;
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return false;
EVT PVT = VT;
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
SDLoc DL(Op);
SDNode *N = Op.getNode();
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT MemVT = LD->getMemoryVT();
ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
: LD->getExtensionType();
SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
MemVT, LD->getMemOperand());
SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
deleteAndRecombine(N);
AddToWorklist(Result.getNode());
return true;
}
return false;
}
/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes that have had a user deleted to the worklist, as
/// they may now have only one use and be subject to other combines.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
if (!N->use_empty())
return false;
SmallSetVector<SDNode *, 16> Nodes;
Nodes.insert(N);
do {
N = Nodes.pop_back_val();
if (!N)
continue;
if (N->use_empty()) {
for (const SDValue &ChildN : N->op_values())
Nodes.insert(ChildN.getNode());
removeFromWorklist(N);
DAG.DeleteNode(N);
} else {
AddToWorklist(N);
}
} while (!Nodes.empty());
return true;
}
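// Illustrative effect of recursivelyDeleteUnusedNodes: deleting a dead
// (add (mul a, b), c) removes the add, then the mul if the add was its only
// user, while an operand such as c that is still used elsewhere is merely
// re-queued on the worklist, since losing a user may enable new combines.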
//===----------------------------------------------------------------------===//
// Main DAG Combiner implementation
//===----------------------------------------------------------------------===//
void DAGCombiner::Run(CombineLevel AtLevel) {
// Set the instance variables so that the various visit routines may use them.
Level = AtLevel;
LegalDAG = Level >= AfterLegalizeDAG;
LegalOperations = Level >= AfterLegalizeVectorOps;
LegalTypes = Level >= AfterLegalizeTypes;
WorklistInserter AddNodes(*this);
// Add all the dag nodes to the worklist.
for (SDNode &Node : DAG.allnodes())
AddToWorklist(&Node);
// Create a dummy node (which is not added to allnodes), that adds a reference
// to the root node, preventing it from being deleted, and tracking any
// changes of the root.
HandleSDNode Dummy(DAG.getRoot());
// While we have a valid worklist entry node, try to combine it.
while (SDNode *N = getNextWorklistEntry()) {
// If N has no uses, it is dead. Make sure to revisit all N's operands once
// N is deleted from the DAG, since they too may now be dead or may have a
// reduced number of uses, allowing other xforms.
if (recursivelyDeleteUnusedNodes(N))
continue;
WorklistRemover DeadNodes(*this);
// If this combine is running after legalizing the DAG, re-legalize any
// nodes pulled off the worklist.
if (LegalDAG) {
SmallSetVector<SDNode *, 16> UpdatedNodes;
bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
for (SDNode *LN : UpdatedNodes)
AddToWorklistWithUsers(LN);
if (!NIsValid)
continue;
}
LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
// Add any operands of the new node which have not yet been combined to the
// worklist as well. Because the worklist uniques things already, this
// won't repeatedly process the same operand.
CombinedNodes.insert(N);
for (const SDValue &ChildN : N->op_values())
if (!CombinedNodes.count(ChildN.getNode()))
AddToWorklist(ChildN.getNode());
SDValue RV = combine(N);
if (!RV.getNode())
continue;
++NodesCombined;
// If we get back the same node we passed in, rather than a new node or
// zero, we know that the node must have defined multiple values and
// CombineTo was used. Since CombineTo takes care of the worklist
// mechanics for us, we have no work to do in this case.
if (RV.getNode() == N)
continue;
assert(N->getOpcode() != ISD::DELETED_NODE &&
RV.getOpcode() != ISD::DELETED_NODE &&
"Node was deleted but visit returned new node!");
LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
DAG.ReplaceAllUsesWith(N, &RV);
}
// Push the new node and any users onto the worklist
AddToWorklist(RV.getNode());
AddUsersToWorklist(RV.getNode());
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node. This will also take care of adding any
// operands which have lost a user to the worklist.
recursivelyDeleteUnusedNodes(N);
}
// If the root changed (e.g. it was a dead load), update the root.
DAG.setRoot(Dummy.getValue());
DAG.RemoveDeadNodes();
}
SDValue DAGCombiner::visit(SDNode *N) {
switch (N->getOpcode()) {
default: break;
case ISD::TokenFactor: return visitTokenFactor(N);
case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
case ISD::ADD: return visitADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::SADDSAT:
case ISD::UADDSAT: return visitADDSAT(N);
case ISD::SSUBSAT:
case ISD::USUBSAT: return visitSUBSAT(N);
case ISD::ADDC: return visitADDC(N);
case ISD::SADDO:
case ISD::UADDO: return visitADDO(N);
case ISD::SUBC: return visitSUBC(N);
case ISD::SSUBO:
case ISD::USUBO: return visitSUBO(N);
case ISD::ADDE: return visitADDE(N);
case ISD::ADDCARRY: return visitADDCARRY(N);
case ISD::SUBE: return visitSUBE(N);
case ISD::SUBCARRY: return visitSUBCARRY(N);
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
case ISD::UMULFIXSAT: return visitMULFIX(N);
case ISD::MUL: return visitMUL(N);
case ISD::SDIV: return visitSDIV(N);
case ISD::UDIV: return visitUDIV(N);
case ISD::SREM:
case ISD::UREM: return visitREM(N);
case ISD::MULHU: return visitMULHU(N);
case ISD::MULHS: return visitMULHS(N);
case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
case ISD::SMULO:
case ISD::UMULO: return visitMULO(N);
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX: return visitIMINMAX(N);
case ISD::AND: return visitAND(N);
case ISD::OR: return visitOR(N);
case ISD::XOR: return visitXOR(N);
case ISD::SHL: return visitSHL(N);
case ISD::SRA: return visitSRA(N);
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
case ISD::FSHL:
case ISD::FSHR: return visitFunnelShift(N);
case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
case ISD::CTLZ: return visitCTLZ(N);
case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
case ISD::CTTZ: return visitCTTZ(N);
case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
case ISD::CTPOP: return visitCTPOP(N);
case ISD::SELECT: return visitSELECT(N);
case ISD::VSELECT: return visitVSELECT(N);
case ISD::SELECT_CC: return visitSELECT_CC(N);
case ISD::SETCC: return visitSETCC(N);
case ISD::SETCCCARRY: return visitSETCCCARRY(N);
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
case ISD::AssertSext:
case ISD::AssertZext: return visitAssertExt(N);
case ISD::AssertAlign: return visitAssertAlign(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
case ISD::TRUNCATE: return visitTRUNCATE(N);
case ISD::BITCAST: return visitBITCAST(N);
case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
case ISD::FADD: return visitFADD(N);
case ISD::FSUB: return visitFSUB(N);
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
case ISD::FPOW: return visitFPOW(N);
case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
case ISD::FP_ROUND: return visitFP_ROUND(N);
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
case ISD::FABS: return visitFABS(N);
case ISD::FFLOOR: return visitFFLOOR(N);
case ISD::FMINNUM: return visitFMINNUM(N);
case ISD::FMAXNUM: return visitFMAXNUM(N);
case ISD::FMINIMUM: return visitFMINIMUM(N);
case ISD::FMAXIMUM: return visitFMAXIMUM(N);
case ISD::FCEIL: return visitFCEIL(N);
case ISD::FTRUNC: return visitFTRUNC(N);
case ISD::BRCOND: return visitBRCOND(N);
case ISD::BR_CC: return visitBR_CC(N);
case ISD::LOAD: return visitLOAD(N);
case ISD::STORE: return visitSTORE(N);
case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
case ISD::MGATHER: return visitMGATHER(N);
case ISD::MLOAD: return visitMLOAD(N);
case ISD::MSCATTER: return visitMSCATTER(N);
case ISD::MSTORE: return visitMSTORE(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
case ISD::FREEZE: return visitFREEZE(N);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
}
return SDValue();
}
SDValue DAGCombiner::combine(SDNode *N) {
SDValue RV;
if (!DisableGenericCombines)
RV = visit(N);
// If nothing happened, try a target-specific DAG combine.
if (!RV.getNode()) {
assert(N->getOpcode() != ISD::DELETED_NODE &&
"Node was deleted but visit returned NULL!");
if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
// Expose the DAG combiner to the target combiner impls.
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
RV = TLI.PerformDAGCombine(N, DagCombineInfo);
}
}
// If nothing happened still, try promoting the operation.
if (!RV.getNode()) {
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
RV = PromoteIntBinOp(SDValue(N, 0));
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
RV = PromoteIntShiftOp(SDValue(N, 0));
break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
RV = PromoteExtend(SDValue(N, 0));
break;
case ISD::LOAD:
if (PromoteLoad(SDValue(N, 0)))
RV = SDValue(N, 0);
break;
}
}
// If N is a commutative binary node, try to eliminate it if the commuted
// version is already present in the DAG.
if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
N->getNumValues() == 1) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Constant operands are canonicalized to RHS.
if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
SDValue Ops[] = {N1, N0};
SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
N->getFlags());
if (CSENode)
return SDValue(CSENode, 0);
}
}
return RV;
}
/// Given a node, return its input chain if it has one, otherwise return a null
/// sd operand.
static SDValue getInputChainForNode(SDNode *N) {
if (unsigned NumOps = N->getNumOperands()) {
if (N->getOperand(0).getValueType() == MVT::Other)
return N->getOperand(0);
if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
return N->getOperand(NumOps-1);
for (unsigned i = 1; i < NumOps-1; ++i)
if (N->getOperand(i).getValueType() == MVT::Other)
return N->getOperand(i);
}
return SDValue();
}
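// Illustrative: for loads, stores and most other chained nodes the MVT::Other
// chain is operand 0, which the first check above catches; the remaining
// checks handle nodes that keep their chain as the last or a middle operand.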
SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
// If N has two operands, where one has an input chain equal to the other,
// the 'other' chain is redundant.
if (N->getNumOperands() == 2) {
if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
return N->getOperand(0);
if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
return N->getOperand(1);
}
// Don't simplify token factors if optnone.
if (OptLevel == CodeGenOpt::None)
return SDValue();
// If the sole user is a token factor, we should make sure we have a
// chance to merge them together. This prevents TF chains from inhibiting
// optimizations.
if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
AddToWorklist(*(N->use_begin()));
SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
SmallPtrSet<SDNode*, 16> SeenOps;
bool Changed = false; // If we should replace this token factor.
// Start out with this token factor.
TFs.push_back(N);
// Iterate through token factors. The TFs list grows when new token factors
// are encountered.
for (unsigned i = 0; i < TFs.size(); ++i) {
// Limit number of nodes to inline, to avoid quadratic compile times.
// We have to add the outstanding Token Factors to Ops, otherwise we might
// drop Ops from the resulting Token Factors.
if (Ops.size() > TokenFactorInlineLimit) {
for (unsigned j = i; j < TFs.size(); j++)
Ops.emplace_back(TFs[j], 0);
// Drop unprocessed Token Factors from TFs, so we do not add them to the
// combiner worklist later.
TFs.resize(i);
break;
}
SDNode *TF = TFs[i];
// Check each of the operands.
for (const SDValue &Op : TF->op_values()) {
switch (Op.getOpcode()) {
case ISD::EntryToken:
// Entry tokens don't need to be added to the list. They are
// redundant.
Changed = true;
break;
case ISD::TokenFactor:
if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
// Queue up for processing.
TFs.push_back(Op.getNode());
Changed = true;
break;
}
LLVM_FALLTHROUGH;
default:
// Only add if it isn't already in the list.
if (SeenOps.insert(Op.getNode()).second)
Ops.push_back(Op);
else
Changed = true;
break;
}
}
}
// Re-visit inlined Token Factors, to clean them up in case they have been
// removed. Skip the first Token Factor, as this is the current node.
for (unsigned i = 1, e = TFs.size(); i < e; i++)
AddToWorklist(TFs[i]);
// Remove Nodes that are chained to another node in the list. Do so
// by walking up chains breadth-first, stopping when we've seen
// another operand. In general we must climb to the EntryNode, but we can exit
// early if we find all remaining work is associated with just one operand, as
// no further pruning is possible.
// List of nodes to search through and original Ops from which they originate.
SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
SmallPtrSet<SDNode *, 16> SeenChains;
bool DidPruneOps = false;
unsigned NumLeftToConsider = 0;
for (const SDValue &Op : Ops) {
Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
OpWorkCount.push_back(1);
}
auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
// If this is an Op, we can remove the op from the list. Re-mark any
// search associated with it as being from the current OpNumber.
if (SeenOps.count(Op) != 0) {
Changed = true;
DidPruneOps = true;
unsigned OrigOpNumber = 0;
while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
OrigOpNumber++;
assert((OrigOpNumber != Ops.size()) &&
"expected to find TokenFactor Operand");
// Re-mark worklist from OrigOpNumber to OpNumber
for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
if (Worklist[i].second == OrigOpNumber) {
Worklist[i].second = OpNumber;
}
}
OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
OpWorkCount[OrigOpNumber] = 0;
NumLeftToConsider--;
}
// Add if it's a new chain
if (SeenChains.insert(Op).second) {
OpWorkCount[OpNumber]++;
Worklist.push_back(std::make_pair(Op, OpNumber));
}
};
for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
// We need to consider at least 2 Ops to prune.
if (NumLeftToConsider <= 1)
break;
auto CurNode = Worklist[i].first;
auto CurOpNumber = Worklist[i].second;
assert((OpWorkCount[CurOpNumber] > 0) &&
"Node should not appear in worklist");
switch (CurNode->getOpcode()) {
case ISD::EntryToken:
// Hitting EntryToken is the only way for the search to terminate without
// hitting another operand's search. Prevent us from marking this operand
// as considered.
NumLeftToConsider++;
break;
case ISD::TokenFactor:
for (const SDValue &Op : CurNode->op_values())
AddToWorklist(i, Op.getNode(), CurOpNumber);
break;
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
case ISD::CopyFromReg:
case ISD::CopyToReg:
AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
break;
default:
if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
break;
}
OpWorkCount[CurOpNumber]--;
if (OpWorkCount[CurOpNumber] == 0)
NumLeftToConsider--;
}
// If we've changed things around then replace token factor.
if (Changed) {
SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
} else {
if (DidPruneOps) {
SmallVector<SDValue, 8> PrunedOps;
// Drop any operand that another operand already depends on through its
// chain; such operands add no extra ordering.
for (const SDValue &Op : Ops) {
if (SeenChains.count(Op.getNode()) == 0)
PrunedOps.push_back(Op);
}
Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
} else {
Result = DAG.getTokenFactor(SDLoc(N), Ops);
}
}
return Result;
}
return SDValue();
}
/// MERGE_VALUES can always be eliminated.
SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
WorklistRemover DeadNodes(*this);
// Replacing results may cause a different MERGE_VALUES to suddenly
// be CSE'd with N, and carry its uses with it. Iterate until no
// uses remain, to ensure that the node can be safely deleted.
// First add the users of this node to the work list so that they
// can be tried again once they have new operands.
AddUsersToWorklist(N);
do {
// Do as a single replacement to avoid rewalking use lists.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));
DAG.ReplaceAllUsesWith(N, Ops.data());
} while (!N->use_empty());
deleteAndRecombine(N);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
/// If \p N is a ConstantSDNode with isOpaque() == false, return it cast to a
/// ConstantSDNode pointer; otherwise return nullptr.
static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
}
SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
"Unexpected binary operator");
// Don't do this unless the old select is going away. We want to eliminate the
// binary operator, not replace a binop with a select.
// TODO: Handle ISD::SELECT_CC.
unsigned SelOpNo = 0;
SDValue Sel = BO->getOperand(0);
if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
SelOpNo = 1;
Sel = BO->getOperand(1);
}
if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
return SDValue();
SDValue CT = Sel.getOperand(1);
if (!isConstantOrConstantVector(CT, true) &&
!isConstantFPBuildVectorOrConstantFP(CT))
return SDValue();
SDValue CF = Sel.getOperand(2);
if (!isConstantOrConstantVector(CF, true) &&
!isConstantFPBuildVectorOrConstantFP(CF))
return SDValue();
// Bail out if any constants are opaque because we can't constant fold those.
// The exception is "and" and "or" with either 0 or -1 in which case we can
// propagate non constant operands into select. I.e.:
// and (select Cond, 0, -1), X --> select Cond, 0, X
// or X, (select Cond, -1, 0) --> select Cond, -1, X
auto BinOpcode = BO->getOpcode();
bool CanFoldNonConst =
(BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
(isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
(isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
SDValue CBO = BO->getOperand(SelOpNo ^ 1);
if (!CanFoldNonConst &&
!isConstantOrConstantVector(CBO, true) &&
!isConstantFPBuildVectorOrConstantFP(CBO))
return SDValue();
EVT VT = Sel.getValueType();
// In the case of a shift, the value and the shift amount may have different
// VTs. For instance, on x86 the shift amount is i8 regardless of the LHS type.
// Bail out if the operands are swapped and the value types do not match.
// NB: x86 is fine if the operands are not swapped and the shift amount VT is
// no bigger than that of the shifted value.
// TODO: it is possible to check for a shift operation, correct the VTs, and
// still perform the optimization on x86 if needed.
if (SelOpNo && VT != CBO.getValueType())
return SDValue();
// We have a select-of-constants followed by a binary operator with a
// constant. Eliminate the binop by pulling the constant math into the select.
// Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
SDLoc DL(Sel);
SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
: DAG.getNode(BinOpcode, DL, VT, CT, CBO);
if (!CanFoldNonConst && !NewCT.isUndef() &&
!isConstantOrConstantVector(NewCT, true) &&
!isConstantFPBuildVectorOrConstantFP(NewCT))
return SDValue();
SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
: DAG.getNode(BinOpcode, DL, VT, CF, CBO);
if (!CanFoldNonConst && !NewCF.isUndef() &&
!isConstantOrConstantVector(NewCF, true) &&
!isConstantFPBuildVectorOrConstantFP(NewCF))
return SDValue();
SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
SelectOp->setFlags(BO->getFlags());
return SelectOp;
}
static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Expecting add or sub");
// Match a constant operand and a zext operand for the math instruction:
// add Z, C
// sub C, Z
bool IsAdd = N->getOpcode() == ISD::ADD;
SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
auto *CN = dyn_cast<ConstantSDNode>(C);
if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// Match the zext operand as a setcc of a boolean.
if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
Z.getOperand(0).getValueType() != MVT::i1)
return SDValue();
// Match the compare as: setcc (X & 1), 0, eq.
SDValue SetCC = Z.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
SetCC.getOperand(0).getOpcode() != ISD::AND ||
!isOneConstant(SetCC.getOperand(0).getOperand(1)))
return SDValue();
// We are adding/subtracting a constant and an inverted low bit. Turn that
// into a subtract/add of the low bit with incremented/decremented constant:
// add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
// sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
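// For example (add form, C == 7): if X is even, (X & 1) == 0, the seteq
// yields 1, and both sides evaluate to 8; if X is odd, the seteq yields 0
// and both sides evaluate to 7.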
EVT VT = C.getValueType();
SDLoc DL(N);
SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
}
/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Expecting add or sub");
// We need a constant operand for the add/sub, and the other operand is a
// logical shift right: add (srl), C or sub C, (srl).
bool IsAdd = N->getOpcode() == ISD::ADD;
SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
ShiftOp.getOpcode() != ISD::SRL)
return SDValue();
// The shift must be of a 'not' value.
SDValue Not = ShiftOp.getOperand(0);
if (!Not.hasOneUse() || !isBitwiseNot(Not))
return SDValue();
// The shift must be moving the sign bit to the least-significant-bit.
EVT VT = ShiftOp.getValueType();
SDValue ShAmt = ShiftOp.getOperand(1);
ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
return SDValue();
// Eliminate the 'not' by adjusting the shift and add/sub constant:
// add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
// sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
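// For example, for i32 and the add form: if X >= 0, (not X) has its sign bit
// set, the srl yields 1 and the result is C + 1; (sra X, 31) is 0 and
// 0 + (C + 1) is also C + 1. If X < 0, the srl yields 0 (result C) and
// (sra X, 31) is -1, so -1 + (C + 1) is also C.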
SDLoc DL(N);
auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
if (SDValue NewC =
DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
{ConstantOp, DAG.getConstant(1, DL, VT)}))
return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
return SDValue();
}
/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
/// are no common bits set in the operands).
SDValue DAGCombiner::visitADDLike(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (add x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
}
// fold (add x, undef) -> undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
// canonicalize constant to RHS
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
// fold (add c1, c2) -> c1+c2
return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1});
}
// fold (add x, 0) -> x
if (isNullConstant(N1))
return N0;
if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
// fold ((A-c1)+c2) -> (A+(c2-c1))
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
SDValue Sub =
DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)});
assert(Sub && "Constant folding failed");
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
}
// fold ((c1-A)+c2) -> (c1+c2)-A
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
SDValue Add =
DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)});
assert(Add && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
}
// add (sext i1 X), 1 -> zext (not i1 X)
// We don't transform this pattern:
// add (zext i1 X), -1 -> sext (not i1 X)
// because most (?) targets generate better code for the zext form.
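// (Check: X == true gives -1 + 1 == 0 == zext(not true); X == false gives
// 0 + 1 == 1 == zext(not false).)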
if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
isOneOrOneSplat(N1)) {
SDValue X = N0.getOperand(0);
if ((!LegalOperations ||
(TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
X.getScalarValueSizeInBits() == 1) {
SDValue Not = DAG.getNOT(DL, X, X.getValueType());
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
}
}
// Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is
// equivalent to (add x, c0).
if (N0.getOpcode() == ISD::OR &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) &&
DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT,
{N1, N0.getOperand(1)}))
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
}
}
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate add
if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) {
if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
return RADD;
}
// fold ((0-A) + B) -> B-A
if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
// fold (A + (0-B)) -> A-B
if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
// fold (A+(B-A)) -> B
if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
return N1.getOperand(0);
// fold ((B-A)+A) -> B
if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
return N0.getOperand(0);
// fold ((A-B)+(C-A)) -> (C-B)
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
N0.getOperand(0) == N1.getOperand(1))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
N0.getOperand(1));
// fold ((A-B)+(B-C)) -> (A-C)
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
N0.getOperand(1) == N1.getOperand(0))
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
N1.getOperand(1));
// fold (A+(B-(A+C))) to (B-C)
if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
N0 == N1.getOperand(1).getOperand(0))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
N1.getOperand(1).getOperand(1));
// fold (A+(B-(C+A))) to (B-C)
if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
N0 == N1.getOperand(1).getOperand(1))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
N1.getOperand(1).getOperand(0));
// fold (A+((B-A)+or-C)) to (B+or-C)
if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
N1.getOperand(0).getOpcode() == ISD::SUB &&
N0 == N1.getOperand(0).getOperand(1))
return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
N1.getOperand(1));
// fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
}
// fold (add (umax X, C), -C) --> (usubsat X, C)
if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
return (!Max && !Op) ||
(Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
};
if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
/*AllowUndefs*/ true))
return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
N0.getOperand(1));
}
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (isOneOrOneSplat(N1)) {
// fold (add (xor a, -1), 1) -> (sub 0, a)
if (isBitwiseNot(N0))
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
N0.getOperand(0));
// fold (add (add (xor a, -1), b), 1) -> (sub b, a)
if (N0.getOpcode() == ISD::ADD ||
N0.getOpcode() == ISD::UADDO ||
N0.getOpcode() == ISD::SADDO) {
SDValue A, Xor;
if (isBitwiseNot(N0.getOperand(0))) {
A = N0.getOperand(1);
Xor = N0.getOperand(0);
} else if (isBitwiseNot(N0.getOperand(1))) {
A = N0.getOperand(0);
Xor = N0.getOperand(1);
}
if (Xor)
return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
}
// Look for:
// add (add x, y), 1
// And if the target does not like this form then turn into:
// sub y, (xor x, -1)
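// ((x + y) + 1 == y - (~x), since ~x == -x - 1.)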
if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
N0.getOpcode() == ISD::ADD) {
SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
}
}
// (x - y) + -1 -> add (xor y, -1), x
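// (Valid because ~y == -y - 1, so x + ~y == (x - y) - 1.)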
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isAllOnesOrAllOnesSplat(N1)) {
SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
}
if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
return Combined;
if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
return Combined;
return SDValue();
}
SDValue DAGCombiner::visitADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
if (SDValue Combined = visitADDLike(N))
return Combined;
if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
return V;
if (SDValue V = foldAddSubOfSignBit(N, DAG))
return V;
// fold (a+b) -> (a|b) iff a and b share no bits.
if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
// Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
APInt C0 = N0->getConstantOperandAPInt(0);
APInt C1 = N1->getConstantOperandAPInt(0);
return DAG.getVScale(DL, VT, C0 + C1);
}
// fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
if ((N0.getOpcode() == ISD::ADD) &&
(N0.getOperand(1).getOpcode() == ISD::VSCALE) &&
(N1.getOpcode() == ISD::VSCALE)) {
auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
auto VS1 = N1->getConstantOperandAPInt(0);
auto VS = DAG.getVScale(DL, VT, VS0 + VS1);
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
}
return SDValue();
}
SDValue DAGCombiner::visitADDSAT(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
// TODO SimplifyVBinOp
// fold (add_sat x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
}
// fold (add_sat x, undef) -> -1
if (N0.isUndef() || N1.isUndef())
return DAG.getAllOnesConstant(DL, VT);
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
// canonicalize constant to RHS
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(Opcode, DL, VT, N1, N0);
// fold (add_sat c1, c2) -> c3
return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1});
}
// fold (add_sat x, 0) -> x
if (isNullConstant(N1))
return N0;
// If it cannot overflow, transform into an add.
if (Opcode == ISD::UADDSAT)
if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
return SDValue();
}
static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
bool Masked = false;
// First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
while (true) {
if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
V = V.getOperand(0);
continue;
}
if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
Masked = true;
V = V.getOperand(0);
continue;
}
break;
}
// If this is not a carry, return.
if (V.getResNo() != 1)
return SDValue();
if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
return SDValue();
EVT VT = V.getNode()->getValueType(0);
if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
return SDValue();
// If the result is masked, then no matter what kind of bool it is we can
// return. If it isn't, then we need to make sure the bool type is either 0 or
// 1 and not other values.
if (Masked ||
TLI.getBooleanContents(V.getValueType()) ==
TargetLoweringBase::ZeroOrOneBooleanContent)
return V;
return SDValue();
}
/// Given the operands of an add/sub operation, see if the 2nd operand is a
/// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
/// the opcode and bypass the mask operation.
static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
SelectionDAG &DAG, const SDLoc &DL) {
if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
return SDValue();
EVT VT = N0.getValueType();
if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
return SDValue();
// add N0, (and (AssertSext X, i1), 1) --> sub N0, X
// sub N0, (and (AssertSext X, i1), 1) --> add N0, X
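// This holds because the value under the mask is known to be 0 or -1, so
// (and X, 1) equals -X, turning the add into a sub and vice versa.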
return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
}
/// Helper for doing combines based on N0 and N1 being added to each other.
SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
SDNode *LocReference) {
EVT VT = N0.getValueType();
SDLoc DL(LocReference);
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N0,
DAG.getNode(ISD::SHL, DL, VT,
N1.getOperand(0).getOperand(1),
N1.getOperand(1)));
if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
return V;
// Look for:
// add (add x, 1), y
// And if the target does not like this form then turn into:
// sub y, (xor x, -1)
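// ((x + 1) + y == y - (~x), since ~x == -x - 1.)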
if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) {
SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
}
// Hoist one-use subtraction by non-opaque constant:
// (x - C) + y -> (x + y) - C
// This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
}
// Hoist one-use subtraction from non-opaque constant:
// (C - x) + y -> (y - x) + C
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
}
// If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
// rather than 'add 0/-1' (the zext should get folded).
// add (sext i1 Y), X --> sub X, (zext i1 Y)
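// (sext i1 Y is 0 or -1, which is exactly the negation of zext i1 Y.)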
if (N0.getOpcode() == ISD::SIGN_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
}
// add X, (sextinreg Y i1) -> sub X, (and Y 1)
if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
if (TN->getVT() == MVT::i1) {
SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
DAG.getConstant(1, DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
}
}
// (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
N1.getResNo() == 0)
return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
N0, N1.getOperand(0), N1.getOperand(2));
// (add X, Carry) -> (addcarry X, 0, Carry)
if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
if (SDValue Carry = getAsCarry(TLI, N1))
return DAG.getNode(ISD::ADDCARRY, DL,
DAG.getVTList(VT, Carry.getValueType()), N0,
DAG.getConstant(0, DL, VT), Carry);
return SDValue();
}
SDValue DAGCombiner::visitADDC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// If the flag result is dead, turn this into an ADD.
if (!N->hasAnyUseOfValue(1))
return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// canonicalize constant to RHS.
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
// fold (addc x, 0) -> x + no carry out
if (isNullConstant(N1))
return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
DL, MVT::Glue));
// If it cannot overflow, transform into an add.
if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
return SDValue();
}
static SDValue flipBoolean(SDValue V, const SDLoc &DL,
SelectionDAG &DAG, const TargetLowering &TLI) {
EVT VT = V.getValueType();
SDValue Cst;
switch (TLI.getBooleanContents(VT)) {
case TargetLowering::ZeroOrOneBooleanContent:
case TargetLowering::UndefinedBooleanContent:
Cst = DAG.getConstant(1, DL, VT);
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
Cst = DAG.getAllOnesConstant(DL, VT);
break;
}
return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
}
/**
* Flips a boolean if it is cheaper to compute. If the Force parameter is set,
* then the flip also occurs if computing the inverse is the same cost.
* This function returns an empty SDValue in case it cannot flip the boolean
* without increasing the cost of the computation. If you want to flip a boolean
* no matter what, use flipBoolean.
*/
static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
const TargetLowering &TLI,
bool Force) {
if (Force && isa<ConstantSDNode>(V))
return flipBoolean(V, SDLoc(V), DAG, TLI);
if (V.getOpcode() != ISD::XOR)
return SDValue();
ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
if (!Const)
return SDValue();
EVT VT = V.getValueType();
bool IsFlip = false;
switch(TLI.getBooleanContents(VT)) {
case TargetLowering::ZeroOrOneBooleanContent:
IsFlip = Const->isOne();
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
IsFlip = Const->isAllOnesValue();
break;
case TargetLowering::UndefinedBooleanContent:
IsFlip = (Const->getAPIntValue() & 0x01) == 1;
break;
}
if (IsFlip)
return V.getOperand(0);
if (Force)
return flipBoolean(V, SDLoc(V), DAG, TLI);
return SDValue();
}
SDValue DAGCombiner::visitADDO(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
bool IsSigned = (ISD::SADDO == N->getOpcode());
EVT CarryVT = N->getValueType(1);
SDLoc DL(N);
// If the flag result is dead, turn this into an ADD.
if (!N->hasAnyUseOfValue(1))
return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
DAG.getUNDEF(CarryVT));
// canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
// fold (addo x, 0) -> x + no carry out
if (isNullOrNullSplat(N1))
return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
if (!IsSigned) {
// If it cannot overflow, transform into an add.
if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
// fold (uaddo (xor a, -1), 1) -> (usubo 0, a) and flip carry.
if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
DAG.getConstant(0, DL, VT), N0.getOperand(0));
return CombineTo(N, Sub,
flipBoolean(Sub.getValue(1), DL, DAG, TLI));
}
if (SDValue Combined = visitUADDOLike(N0, N1, N))
return Combined;
if (SDValue Combined = visitUADDOLike(N1, N0, N))
return Combined;
}
return SDValue();
}
SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N0.getValueType();
if (VT.isVector())
return SDValue();
// (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
// If Y + 1 cannot overflow.
if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
SDValue Y = N1.getOperand(0);
SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
N1.getOperand(2));
}
// (uaddo X, Carry) -> (addcarry X, 0, Carry)
if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
if (SDValue Carry = getAsCarry(TLI, N1))
return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
DAG.getConstant(0, SDLoc(N), VT), Carry);
return SDValue();
}
SDValue DAGCombiner::visitADDE(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
// canonicalize constant to RHS
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
N1, N0, CarryIn);
// fold (adde x, y, false) -> (addc x, y)
if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
return SDValue();
}
SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
SDLoc DL(N);
// canonicalize constant to RHS
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
// fold (addcarry x, y, false) -> (uaddo x, y)
if (isNullConstant(CarryIn)) {
if (!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
}
// fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
if (isNullConstant(N0) && isNullConstant(N1)) {
EVT VT = N0.getValueType();
EVT CarryVT = CarryIn.getValueType();
SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
AddToWorklist(CarryExt.getNode());
return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
DAG.getConstant(1, DL, VT)),
DAG.getConstant(0, DL, CarryVT));
}
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
return Combined;
return SDValue();
}
/**
* If we are facing some sort of diamond carry propagation pattern, try to
* break it up to generate something like:
* (addcarry X, 0, (addcarry A, B, Z):Carry)
*
* The end result is usually an increase in the number of operations required,
* but because the carry is now linearized, other transforms can kick in and
* optimize the DAG.
*
* Patterns typically look something like
* (uaddo A, B)
* / \
* Carry Sum
* | \
* | (addcarry *, 0, Z)
* | /
* \ Carry
* | /
* (addcarry X, *, *)
*
* But numerous variations exist. Our goal is to identify A, B, X and Z and
* produce a combine with a single path for carry propagation.
*/
static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
SDValue X, SDValue Carry0, SDValue Carry1,
SDNode *N) {
if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
return SDValue();
if (Carry1.getOpcode() != ISD::UADDO)
return SDValue();
SDValue Z;
/**
* First look for a suitable Z. It will present itself in the form of
* (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
*/
if (Carry0.getOpcode() == ISD::ADDCARRY &&
isNullConstant(Carry0.getOperand(1))) {
Z = Carry0.getOperand(2);
} else if (Carry0.getOpcode() == ISD::UADDO &&
isOneConstant(Carry0.getOperand(1))) {
EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
} else {
// We couldn't find a suitable Z.
return SDValue();
}
auto cancelDiamond = [&](SDValue A,SDValue B) {
SDLoc DL(N);
SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
Combiner.AddToWorklist(NewY.getNode());
return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
DAG.getConstant(0, DL, X.getValueType()),
NewY.getValue(1));
};
/**
* (uaddo A, B)
* |
* Sum
* |
* (addcarry *, 0, Z)
*/
if (Carry0.getOperand(0) == Carry1.getValue(0)) {
return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
}
/**
* (addcarry A, 0, Z)
* |
* Sum
* |
* (uaddo *, B)
*/
if (Carry1.getOperand(0) == Carry0.getValue(0)) {
return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
}
if (Carry1.getOperand(1) == Carry0.getValue(0)) {
return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
}
return SDValue();
}
// If we are facing some sort of diamond carry/borrow in/out pattern try to
// match patterns like:
//
// (uaddo A, B) CarryIn
// | \ |
// | \ |
// PartialSum PartialCarryOutX /
// | | /
// | ____|____________/
// | / |
// (uaddo *, *) \________
// | \ \
// | \ |
// | PartialCarryOutY |
// | \ |
// | \ /
// AddCarrySum | ______/
// | /
// CarryOut = (or *, *)
//
// And generate ADDCARRY (or SUBCARRY) with two result values:
//
// {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
//
// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
// a single path for carry/borrow out propagation:
static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
const TargetLowering &TLI, SDValue Carry0,
SDValue Carry1, SDNode *N) {
if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
return SDValue();
unsigned Opcode = Carry0.getOpcode();
if (Opcode != Carry1.getOpcode())
return SDValue();
if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
return SDValue();
// Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
// carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in
// the above ASCII art.)
if (Carry1.getOperand(0) != Carry0.getValue(0) &&
Carry1.getOperand(1) != Carry0.getValue(0))
std::swap(Carry0, Carry1);
if (Carry1.getOperand(0) != Carry0.getValue(0) &&
Carry1.getOperand(1) != Carry0.getValue(0))
return SDValue();
// The carry-in value must be on the right-hand side for subtraction.
unsigned CarryInOperandNum =
Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
return SDValue();
SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
return SDValue();
// Verify that the carry/borrow in is plausibly a carry/borrow bit.
// TODO: make getAsCarry() aware of how partial carries are merged.
if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
CarryIn = CarryIn.getOperand(0);
if (CarryIn.getValueType() != MVT::i1)
return SDValue();
SDLoc DL(N);
SDValue Merged =
DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
Carry0.getOperand(1), CarryIn);
// Please note that because we have proven that the result of the UADDO/USUBO
// of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
// therefore prove that if the first UADDO/USUBO overflows, the second
// UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
// maximum value.
//
// 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
// 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
//
// This is important because it means that OR and XOR can be used to merge
// carry flags; and that AND can return a constant zero.
//
// TODO: match other operations that can merge flags (ADD, etc)
DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
if (N->getOpcode() == ISD::AND)
return DAG.getConstant(0, DL, MVT::i1);
return Merged.getValue(1);
}
SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
SDNode *N) {
// fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
if (isBitwiseNot(N0))
if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
SDLoc DL(N);
SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
N0.getOperand(0), NotC);
return CombineTo(N, Sub,
flipBoolean(Sub.getValue(1), DL, DAG, TLI));
}
// Iff the flag result is dead:
// (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
// Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
// or the dependency between the instructions.
if ((N0.getOpcode() == ISD::ADD ||
(N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
N0.getValue(1) != CarryIn)) &&
isNullConstant(N1) && !N->hasAnyUseOfValue(1))
return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
N0.getOperand(0), N0.getOperand(1), CarryIn);
/**
* When one of the addcarry arguments is itself a carry, we may be facing
* a diamond carry propagation, in which case we try to transform the DAG
* to ensure linear carry propagation if that is possible.
*/
if (auto Y = getAsCarry(TLI, N1)) {
// Because both are carries, Y and Z can be swapped.
if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
return R;
if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
return R;
}
return SDValue();
}
// Since it may not be valid to emit a fold to zero for vector initializers,
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
SelectionDAG &DAG, bool LegalOperations) {
if (!VT.isVector())
return DAG.getConstant(0, DL, VT);
if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
return DAG.getConstant(0, DL, VT);
return SDValue();
}
SDValue DAGCombiner::visitSUB(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (sub x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
}
// fold (sub x, x) -> 0
// FIXME: Refactor this and xor and other similar operations together.
if (N0 == N1)
return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// fold (sub c1, c2) -> c3
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
// fold (sub x, c) -> (add x, -c)
if (N1C) {
return DAG.getNode(ISD::ADD, DL, VT, N0,
DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
}
if (isNullOrNullSplat(N0)) {
unsigned BitWidth = VT.getScalarSizeInBits();
// Right-shifting everything out but the sign bit followed by negation is
// the same as flipping arithmetic/logical shift type without the negation:
// -(X >>u 31) -> (X >>s 31)
// -(X >>s 31) -> (X >>u 31)
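// (X >>u 31 is 0 or 1 and X >>s 31 is 0 or -1; negating either one produces
// the other.)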
if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
}
}
// 0 - X --> 0 if the sub is NUW.
if (N->getFlags().hasNoUnsignedWrap())
return N0;
if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
// N1 is either 0 or the minimum signed value. If the sub is NSW, then
// N1 must be 0 because negating the minimum signed value is undefined.
if (N->getFlags().hasNoSignedWrap())
return N0;
// 0 - X --> X if X is 0 or the minimum signed value.
return N1;
}
}
// Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
if (isAllOnesOrAllOnesSplat(N0))
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
// fold (A - (0-B)) -> A+B
if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
// fold A-(A-B) -> B
if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
return N1.getOperand(1);
// fold (A+B)-A -> B
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
return N0.getOperand(1);
// fold (A+B)-B -> A
if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
return N0.getOperand(0);
// fold (A+C1)-C2 -> A+(C1-C2)
if (N0.getOpcode() == ISD::ADD &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
SDValue NewC =
DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
}
// fold C2-(A+C1) -> (C2-C1)-A
if (N1.getOpcode() == ISD::ADD) {
SDValue N11 = N1.getOperand(1);
if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
}
}
// fold (A-C1)-C2 -> A-(C1+C2)
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
SDValue NewC =
DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
}
// fold (c1-A)-c2 -> (c1-c2)-A
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) {
SDValue NewC =
DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
}
// fold ((A+(B+or-C))-B) -> A+or-C
if (N0.getOpcode() == ISD::ADD &&
(N0.getOperand(1).getOpcode() == ISD::SUB ||
N0.getOperand(1).getOpcode() == ISD::ADD) &&
N0.getOperand(1).getOperand(0) == N1)
return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
N0.getOperand(1).getOperand(1));
// fold ((A+(C+B))-B) -> A+C
if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
N0.getOperand(1).getOperand(1) == N1)
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
N0.getOperand(1).getOperand(0));
// fold ((A-(B-C))-C) -> A-B
if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
N0.getOperand(1).getOperand(1) == N1)
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
N0.getOperand(1).getOperand(0));
// fold (A-(B-C)) -> A+(C-B)
if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
return DAG.getNode(ISD::ADD, DL, VT, N0,
DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
N1.getOperand(0)));
// A - (A & B) -> A & (~B)
if (N1.getOpcode() == ISD::AND) {
SDValue A = N1.getOperand(0);
SDValue B = N1.getOperand(1);
if (A != N0)
std::swap(A, B);
if (A == N0 &&
(N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
SDValue InvB =
DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::AND, DL, VT, A, InvB);
}
}
// fold (X - (-Y * Z)) -> (X + (Y * Z))
if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
if (N1.getOperand(0).getOpcode() == ISD::SUB &&
isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
N1.getOperand(0).getOperand(1),
N1.getOperand(1));
return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
}
if (N1.getOperand(1).getOpcode() == ISD::SUB &&
isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
N1.getOperand(0),
N1.getOperand(1).getOperand(1));
return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
}
}
// If either operand of a sub is undef, the result is undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
return V;
if (SDValue V = foldAddSubOfSignBit(N, DAG))
return V;
if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
return V;
// (x - y) - 1 -> add (xor y, -1), x
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
}
// Look for:
// sub y, (xor x, -1)
// And if the target does not like this form then turn into:
// add (add x, y), 1
if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
}
// Hoist one-use addition by non-opaque constant:
// (x + C) - y -> (x - y) + C
if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
}
// y - (x + C) -> (y - x) - C
if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD &&
isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
}
// (x - C) - y -> (x - y) - C
// This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
}
// (C - x) - y -> C - (x + y)
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
}
// If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
// rather than 'sub 0/1' (the sext should get folded).
// sub X, (zext i1 Y) --> add X, (sext i1 Y)
if (N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
TLI.getBooleanContents(VT) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
}
// fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
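// (Y is 0 when X is non-negative and -1 when X is negative; xor with Y
// followed by subtracting Y leaves X unchanged in the first case and computes
// ~X + 1 == -X in the second, i.e. |X|.)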
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
SDValue S0 = N1.getOperand(0);
if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) {
unsigned OpSizeInBits = VT.getScalarSizeInBits();
if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
if (C->getAPIntValue() == (OpSizeInBits - 1))
return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
}
}
}
// If the relocation model supports it, consider symbol offsets.
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
// fold (sub Sym, c) -> Sym-c
if (N1C && GA->getOpcode() == ISD::GlobalAddress)
return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
GA->getOffset() -
(uint64_t)N1C->getSExtValue());
// fold (sub Sym+c1, Sym+c2) -> c1-c2
if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
if (GA->getGlobal() == GB->getGlobal())
return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
DL, VT);
}
// sub X, (sextinreg Y i1) -> add X, (and Y 1)
if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
if (TN->getVT() == MVT::i1) {
SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
DAG.getConstant(1, DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
}
}
// canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
if (N1.getOpcode() == ISD::VSCALE) {
APInt IntVal = N1.getConstantOperandAPInt(0);
return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
}
// Prefer an add for more folding potential and possibly better codegen:
// sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
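// (The logical shift of the sign bit yields 0 or 1 and the arithmetic shift
// yields 0 or -1, so subtracting one equals adding the other.)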
if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
SDValue ShAmt = N1.getOperand(1);
ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
if (ShAmtC &&
ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
}
}
if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
// (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
if (SDValue Carry = getAsCarry(TLI, N0)) {
SDValue X = N1;
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
return DAG.getNode(ISD::ADDCARRY, DL,
DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
Carry);
}
}
return SDValue();
}
SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
// TODO SimplifyVBinOp
// fold (sub_sat x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
}
// fold (sub_sat x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// fold (sub_sat x, x) -> 0
if (N0 == N1)
return DAG.getConstant(0, DL, VT);
// fold (sub_sat c1, c2) -> c3
if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
return C;
// fold (sub_sat x, 0) -> x
if (isNullConstant(N1))
return N0;
return SDValue();
}
SDValue DAGCombiner::visitSUBC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// If the flag result is dead, turn this into a SUB.
if (!N->hasAnyUseOfValue(1))
return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// fold (subc x, x) -> 0 + no borrow
if (N0 == N1)
return CombineTo(N, DAG.getConstant(0, DL, VT),
DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// fold (subc x, 0) -> x + no borrow
if (isNullConstant(N1))
return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
if (isAllOnesConstant(N0))
return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
return SDValue();
}
SDValue DAGCombiner::visitSUBO(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
bool IsSigned = (ISD::SSUBO == N->getOpcode());
EVT CarryVT = N->getValueType(1);
SDLoc DL(N);
// If the flag result is dead, turn this into a SUB.
if (!N->hasAnyUseOfValue(1))
return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
DAG.getUNDEF(CarryVT));
// fold (subo x, x) -> 0 + no borrow
if (N0 == N1)
return CombineTo(N, DAG.getConstant(0, DL, VT),
DAG.getConstant(0, DL, CarryVT));
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
// fold (subo x, c) -> (addo x, -c)
if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
}
// fold (subo x, 0) -> x + no borrow
if (isNullOrNullSplat(N1))
return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
// Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
DAG.getConstant(0, DL, CarryVT));
return SDValue();
}
SDValue DAGCombiner::visitSUBE(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
// fold (sube x, y, false) -> (subc x, y)
if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);
return SDValue();
}
SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
// fold (subcarry x, y, false) -> (usubo x, y)
if (isNullConstant(CarryIn)) {
if (!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
}
return SDValue();
}
// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
// UMULFIXSAT here.
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue Scale = N->getOperand(2);
EVT VT = N0.getValueType();
// fold (mulfix x, undef, scale) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// Canonicalize constant to RHS (vector doesn't have to splat)
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);
// fold (mulfix x, 0, scale) -> 0
if (isNullConstant(N1))
return DAG.getConstant(0, SDLoc(N), VT);
return SDValue();
}
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// fold (mul x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
bool N1IsConst = false;
bool N1IsOpaqueConst = false;
APInt ConstValue1;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
assert((!N1IsConst ||
ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
"Splat APInt should be element width");
} else {
N1IsConst = isa<ConstantSDNode>(N1);
if (N1IsConst) {
ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
}
}
// fold (mul c1, c2) -> c1*c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS (vector doesn't have to splat)
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1.isNullValue())
return N1;
// fold (mul x, 1) -> x
if (N1IsConst && ConstValue1.isOneValue())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (mul x, -1) -> 0-x
if (N1IsConst && ConstValue1.isAllOnesValue()) {
SDLoc DL(N);
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), N0);
}
// fold (mul x, (1 << c)) -> x << c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1) &&
(!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
SDLoc DL(N);
SDValue LogBase2 = BuildLogBase2(N1, DL);
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
}
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
SDLoc DL(N);
// FIXME: If the input is something that is easily negated (e.g. a
// single-use add), we should put the negate there.
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT),
DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(Log2Val, DL,
getShiftAmountTy(N0.getValueType()))));
}
// Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
// mul x, (2^N + 1) --> add (shl x, N), x
// mul x, (2^N - 1) --> sub (shl x, N), x
// Examples: x * 33 --> (x << 5) + x
// x * 15 --> (x << 4) - x
// x * -33 --> -((x << 5) + x)
// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
// TODO: We could handle more general decomposition of any constant by
// having the target set a limit on number of ops and making a
// callback to determine that sequence (similar to sqrt expansion).
unsigned MathOp = ISD::DELETED_NODE;
APInt MulC = ConstValue1.abs();
if ((MulC - 1).isPowerOf2())
MathOp = ISD::ADD;
else if ((MulC + 1).isPowerOf2())
MathOp = ISD::SUB;
if (MathOp != ISD::DELETED_NODE) {
unsigned ShAmt =
MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
assert(ShAmt < VT.getScalarSizeInBits() &&
"multiply-by-constant generated out of bounds shift");
SDLoc DL(N);
SDValue Shl =
DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0);
if (ConstValue1.isNegative())
R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
return R;
}
}
// (mul (shl X, c1), c2) -> (mul X, c2 << c1)
if (N0.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
if (isConstantOrConstantVector(C3))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
}
// Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
// use.
{
SDValue Sh(nullptr, 0), Y(nullptr, 0);
// Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
if (N0.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N0.getOperand(1)) &&
N0.getNode()->hasOneUse()) {
Sh = N0; Y = N1;
} else if (N1.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N1.getOperand(1)) &&
N1.getNode()->hasOneUse()) {
Sh = N1; Y = N0;
}
if (Sh.getNode()) {
SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
}
}
// fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
N0.getOpcode() == ISD::ADD &&
DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
isMulAddWithConstProfitable(N, N0, N1))
return DAG.getNode(ISD::ADD, SDLoc(N), VT,
DAG.getNode(ISD::MUL, SDLoc(N0), VT,
N0.getOperand(0), N1),
DAG.getNode(ISD::MUL, SDLoc(N1), VT,
N0.getOperand(1), N1));
// Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
if (N0.getOpcode() == ISD::VSCALE)
if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
APInt C0 = N0.getConstantOperandAPInt(0);
APInt C1 = NC1->getAPIntValue();
return DAG.getVScale(SDLoc(N), VT, C0 * C1);
}
// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
return RMUL;
return SDValue();
}
/// Return true if divmod libcall is available.
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
const TargetLowering &TLI) {
RTLIB::Libcall LC;
EVT NodeType = Node->getValueType(0);
if (!NodeType.isSimple())
return false;
switch (NodeType.getSimpleVT().SimpleTy) {
default: return false; // No libcall for vector types.
case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
}
return TLI.getLibcallName(LC) != nullptr;
}
/// Issue divrem if both quotient and remainder are needed.
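/// For example, if both (sdiv X, Y) and (srem X, Y) are present, a single
/// (sdivrem X, Y) node can be emitted and each original user rewired to the
/// corresponding result value.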
SDValue DAGCombiner::useDivRem(SDNode *Node) {
if (Node->use_empty())
return SDValue(); // This is a dead node, leave it alone.
unsigned Opcode = Node->getOpcode();
bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
// DIVREM lib calls can still work on non-legal types if lib-calls are used.
EVT VT = Node->getValueType(0);
if (VT.isVector() || !VT.isInteger())
return SDValue();
if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
return SDValue();
// If DIVREM is going to get expanded into a libcall,
// but there is no libcall available, then don't combine.
if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
!isDivRemLibcallAvailable(Node, isSigned, TLI))
return SDValue();
// If div is legal, it's better to do the normal expansion
unsigned OtherOpcode = 0;
if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
if (TLI.isOperationLegalOrCustom(Opcode, VT))
return SDValue();
} else {
OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
return SDValue();
}
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
SDValue combined;
for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
User->use_empty())
continue;
// Convert the other matching node(s), too;
// otherwise, the DIVREM may get target-legalized into something
// target-specific that we won't be able to recognize.
unsigned UserOpc = User->getOpcode();
if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
User->getOperand(0) == Op0 &&
User->getOperand(1) == Op1) {
if (!combined) {
if (UserOpc == OtherOpcode) {
SDVTList VTs = DAG.getVTList(VT, VT);
combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
} else if (UserOpc == DivRemOpc) {
combined = SDValue(User, 0);
} else {
assert(UserOpc == Opcode);
continue;
}
}
if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
CombineTo(User, combined);
else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
CombineTo(User, combined.getValue(1));
}
}
return combined;
}
static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
unsigned Opc = N->getOpcode();
bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// X / undef -> undef
// X % undef -> undef
// X / 0 -> undef
// X % 0 -> undef
// NOTE: This includes vectors where any divisor element is zero/undef.
if (DAG.isUndef(Opc, {N0, N1}))
return DAG.getUNDEF(VT);
// undef / X -> 0
// undef % X -> 0
if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
// 0 / X -> 0
// 0 % X -> 0
ConstantSDNode *N0C = isConstOrConstSplat(N0);
if (N0C && N0C->isNullValue())
return N0;
// X / X -> 1
// X % X -> 0
if (N0 == N1)
return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);
// X / 1 -> X
// X % 1 -> 0
// If this is a boolean op (single-bit element type), we can't have
// division-by-zero or remainder-by-zero, so assume the divisor is 1.
// TODO: Similarly, if we're zero-extending a boolean divisor, then assume
// it's a 1.
if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
return SDValue();
}
SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
SDLoc DL(N);
// fold (sdiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
return C;
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
// fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
if (N1C && N1C->getAPIntValue().isMinSignedValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT));
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// If we know the sign bits of both operands are zero, strength reduce to a
// udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
if (SDValue V = visitSDIVLike(N0, N1, N)) {
// If the corresponding remainder node exists, update its users with
// (Dividend - (Quotient * Divisor)).
if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
{ N0, N1 })) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
AddToWorklist(Mul.getNode());
AddToWorklist(Sub.getNode());
CombineTo(RemNode, Sub);
}
return V;
}
// sdiv, srem -> sdivrem
// If the divisor is constant, then return DIVREM only if isIntDivCheap() is
// true. Otherwise, we break the simplification logic in visitREM().
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue DivRem = useDivRem(N))
return DivRem;
return SDValue();
}
SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
unsigned BitWidth = VT.getScalarSizeInBits();
// Helper for determining whether a value is a power-2 constant scalar or a
// vector of such elements.
auto IsPowerOfTwo = [](ConstantSDNode *C) {
if (C->isNullValue() || C->isOpaque())
return false;
if (C->getAPIntValue().isPowerOf2())
return true;
if ((-C->getAPIntValue()).isPowerOf2())
return true;
return false;
};
// fold (sdiv X, pow2) -> simple ops after legalize
// FIXME: We check for the exact bit here because the generic lowering gives
// better results in that case. The target-specific lowering should learn how
// to handle exact sdivs efficiently.
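// For example, for i32 (sdiv X, 8) the core rounding sequence is
//   (sra (add X, (srl (sra X, 31), 29)), 3),
// i.e. add 7 to X when X is negative so the shift rounds toward zero.
// (The selects built below handle divisors of 1/-1 and negative divisors.)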
if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
// Target-specific implementation of sdiv x, pow2.
if (SDValue Res = BuildSDIVPow2(N))
return Res;
// Create constants that are functions of the shift amount value.
EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
if (!isConstantOrConstantVector(Inexact))
return SDValue();
// Splat the sign bit into the register
SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
AddToWorklist(Sign.getNode());
// Add (N0 < 0) ? abs2 - 1 : 0;
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
AddToWorklist(Srl.getNode());
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
AddToWorklist(Add.getNode());
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
AddToWorklist(Sra.getNode());
// Special case: (sdiv X, 1) -> X
// Special case: (sdiv X, -1) -> 0-X
SDValue One = DAG.getConstant(1, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
// If dividing by a positive value, we're done. Otherwise, the result must
// be negated.
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
// FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
return Res;
}
// If integer divide is expensive and we satisfy the requirements, emit an
// alternate sequence. Targets may check function attributes for size/speed
// trade-offs.
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isConstantOrConstantVector(N1) &&
!TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildSDIV(N))
return Op;
return SDValue();
}
SDValue DAGCombiner::visitUDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
SDLoc DL(N);
// fold (udiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
return C;
// fold (udiv X, -1) -> select(X == -1, 1, 0)
if (N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT));
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (SDValue V = visitUDIVLike(N0, N1, N)) {
// If the corresponding remainder node exists, update its users with
// (Dividend - (Quotient * Divisor)).
if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
{ N0, N1 })) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
AddToWorklist(Mul.getNode());
AddToWorklist(Sub.getNode());
CombineTo(RemNode, Sub);
}
return V;
}
// udiv, urem -> udivrem
// If the divisor is constant, then return DIVREM only if isIntDivCheap() is
// true. Otherwise, we break the simplification logic in visitREM().
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue DivRem = useDivRem(N))
return DivRem;
return SDValue();
}
SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
SDValue LogBase2 = BuildLogBase2(N1, DL);
AddToWorklist(LogBase2.getNode());
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
AddToWorklist(Trunc.getNode());
return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
}
// fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
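// e.g. (udiv x, (shl 4, y)) --> (srl x, (add y, 2))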
if (N1.getOpcode() == ISD::SHL) {
SDValue N10 = N1.getOperand(0);
if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N10)) {
SDValue LogBase2 = BuildLogBase2(N10, DL);
AddToWorklist(LogBase2.getNode());
EVT ADDVT = N1.getOperand(1).getValueType();
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
AddToWorklist(Trunc.getNode());
SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
}
}
// fold (udiv x, c) -> alternate
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isConstantOrConstantVector(N1) &&
!TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
return SDValue();
}
// handles ISD::SREM and ISD::UREM
SDValue DAGCombiner::visitREM(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
bool isSigned = (Opcode == ISD::SREM);
SDLoc DL(N);
// fold (rem c1, c2) -> c1%c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
return C;
// fold (urem X, -1) -> select(X == -1, 0, x)
if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(0, DL, VT), N0);
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (isSigned) {
// If we know the sign bits of both operands are zero, strength reduce to a
// urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
} else {
SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
if (DAG.isKnownToBeAPowerOfTwo(N1)) {
// fold (urem x, pow2) -> (and x, pow2-1)
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
if (N1.getOpcode() == ISD::SHL &&
DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
// fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
}
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
// Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
// speculative DIV must not cause a DIVREM conversion. We guard against this
// by skipping the simplification if isIntDivCheap(). When div is not cheap,
// combine will not return a DIVREM. Regardless, checking cheapness here
// makes sense since the simplification results in fatter code.
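// For example, (srem X, 7) becomes (sub X, (mul Q, 7)), where Q is the
// quotient produced by the SDIVLike expansion of (sdiv X, 7).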
if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
SDValue OptimizedDiv =
isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
if (OptimizedDiv.getNode()) {
// If the equivalent Div node also exists, update its users.
unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
{ N0, N1 }))
CombineTo(DivNode, OptimizedDiv);
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
AddToWorklist(OptimizedDiv.getNode());
AddToWorklist(Mul.getNode());
return Sub;
}
}
// srem -> sdivrem; urem -> udivrem
if (SDValue DivRem = useDivRem(N))
return DivRem.getValue(1);
return SDValue();
}
SDValue DAGCombiner::visitMULHS(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (VT.isVector()) {
// fold (mulhs x, 0) -> 0
// do not return N0/N1, because undef node may exist.
if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
ISD::isBuildVectorAllZeros(N1.getNode()))
return DAG.getConstant(0, DL, VT);
}
// fold (mulhs x, 0) -> 0
if (isNullConstant(N1))
return N1;
// fold (mulhs x, 1) -> (sra x, size(x)-1)
if (isOneConstant(N1))
return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
getShiftAmountTy(N0.getValueType())));
// fold (mulhs x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// If the type twice as wide is legal, transform the mulhs to a wider multiply
// plus a shift.
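// e.g. for i16 operands with a legal i32 multiply:
//   (mulhs x, y) --> (trunc (srl (mul (sext x), (sext y)), 16))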
if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
DAG.getConstant(SimpleSize, DL,
getShiftAmountTy(N1.getValueType())));
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
}
}
return SDValue();
}
SDValue DAGCombiner::visitMULHU(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (VT.isVector()) {
// fold (mulhu x, 0) -> 0
// do not return N0/N1, because undef node may exist.
if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
ISD::isBuildVectorAllZeros(N1.getNode()))
return DAG.getConstant(0, DL, VT);
}
// fold (mulhu x, 0) -> 0
if (isNullConstant(N1))
return N1;
// fold (mulhu x, 1) -> 0
if (isOneConstant(N1))
return DAG.getConstant(0, DL, N0.getValueType());
// fold (mulhu x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue LogBase2 = BuildLogBase2(N1, DL);
SDValue SRLAmt = DAG.getNode(
ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
}
// If the type twice as wide is legal, transform the mulhu to a wider multiply
// plus a shift.
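// e.g. for i16 operands with a legal i32 multiply:
//   (mulhu x, y) --> (trunc (srl (mul (zext x), (zext y)), 16))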
if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
DAG.getConstant(SimpleSize, DL,
getShiftAmountTy(N1.getValueType())));
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
}
}
return SDValue();
}
/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
/// give the opcodes for the two computations that are being performed. Return
/// the combined node if a simplification was made.
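/// For example, if only the low result of (smul_lohi X, Y) is used and MUL is
/// legal, the node is replaced with a plain (mul X, Y).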
SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp) {
// If the high half is not needed, just compute the low half.
bool HiExists = N->hasAnyUseOfValue(1);
if (!HiExists && (!LegalOperations ||
TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
return CombineTo(N, Res, Res);
}
// If the low half is not needed, just compute the high half.
bool LoExists = N->hasAnyUseOfValue(0);
if (!LoExists && (!LegalOperations ||
TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
return CombineTo(N, Res, Res);
}
// If both halves are used, return as it is.
if (LoExists && HiExists)
return SDValue();
// If the two computed results can be simplified separately, separate them.
if (LoExists) {
SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
AddToWorklist(Lo.getNode());
SDValue LoOpt = combine(Lo.getNode());
if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
(!LegalOperations ||
TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
return CombineTo(N, LoOpt, LoOpt);
}
if (HiExists) {
SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
AddToWorklist(Hi.getNode());
SDValue HiOpt = combine(Hi.getNode());
if (HiOpt.getNode() && HiOpt != Hi &&
(!LegalOperations ||
TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
return CombineTo(N, HiOpt, HiOpt);
}
return SDValue();
}
SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
return Res;
EVT VT = N->getValueType(0);
SDLoc DL(N);
// If the type twice as wide is legal, transform the smul_lohi to a wider
// multiply plus a shift.
if (VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part as N1.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
DAG.getConstant(SimpleSize, DL,
getShiftAmountTy(Lo.getValueType())));
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part as N0.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
return CombineTo(N, Lo, Hi);
}
}
return SDValue();
}
SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
return Res;
EVT VT = N->getValueType(0);
SDLoc DL(N);
// (umul_lohi N0, 0) -> (0, 0)
if (isNullConstant(N->getOperand(1))) {
SDValue Zero = DAG.getConstant(0, DL, VT);
return CombineTo(N, Zero, Zero);
}
// (umul_lohi N0, 1) -> (N0, 0)
if (isOneConstant(N->getOperand(1))) {
SDValue Zero = DAG.getConstant(0, DL, VT);
return CombineTo(N, N->getOperand(0), Zero);
}
// If the type twice as wide is legal, transform the umul_lohi to a wider
// multiply plus a shift.
if (VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part as N1.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
DAG.getConstant(SimpleSize, DL,
getShiftAmountTy(Lo.getValueType())));
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part as N0.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
return CombineTo(N, Lo, Hi);
}
}
return SDValue();
}
SDValue DAGCombiner::visitMULO(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
bool IsSigned = (ISD::SMULO == N->getOpcode());
EVT CarryVT = N->getValueType(1);
SDLoc DL(N);
// canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
// fold (mulo x, 0) -> 0 + no carry out
if (isNullOrNullSplat(N1))
return CombineTo(N, DAG.getConstant(0, DL, VT),
DAG.getConstant(0, DL, CarryVT));
// (mulo x, 2) -> (addo x, x)
if (ConstantSDNode *C2 = isConstOrConstSplat(N1))
if (C2->getAPIntValue() == 2)
return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
N->getVTList(), N0, N0);
return SDValue();
}
SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Opcode = N->getOpcode();
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold operation with constant operands.
if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
// If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
// Only do this if the current op isn't legal and the flipped is.
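// e.g. (smax x, y) --> (umax x, y) when the sign bits of both operands are zero.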
if (!TLI.isOperationLegal(Opcode, VT) &&
(N0.isUndef() || DAG.SignBitIsZero(N0)) &&
(N1.isUndef() || DAG.SignBitIsZero(N1))) {
unsigned AltOpcode;
switch (Opcode) {
case ISD::SMIN: AltOpcode = ISD::UMIN; break;
case ISD::SMAX: AltOpcode = ISD::UMAX; break;
case ISD::UMIN: AltOpcode = ISD::SMIN; break;
case ISD::UMAX: AltOpcode = ISD::SMAX; break;
default: llvm_unreachable("Unknown MINMAX opcode");
}
if (TLI.isOperationLegal(AltOpcode, VT))
return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
}
return SDValue();
}
/// If this is a bitwise logic instruction and both operands have the same
/// opcode, try to sink the other opcode after the logic instruction.
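/// For example, (and (zext X), (zext Y)) --> (zext (and X, Y)).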
SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned LogicOpcode = N->getOpcode();
unsigned HandOpcode = N0.getOpcode();
assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
LogicOpcode == ISD::XOR) && "Expected logic opcode");
assert(HandOpcode == N1.getOpcode() && "Bad input!");
// Bail early if none of these transforms apply.
if (N0.getNumOperands() == 0)
return SDValue();
// FIXME: We should check number of uses of the operands to not increase
// the instruction count for all transforms.
// Handle size-changing casts.
SDValue X = N0.getOperand(0);
SDValue Y = N1.getOperand(0);
EVT XVT = X.getValueType();
SDLoc DL(N);
if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
HandOpcode == ISD::SIGN_EXTEND) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
return SDValue();
// We need matching integer source types.
if (XVT != Y.getValueType())
return SDValue();
// Don't create an illegal op during or after legalization. Don't ever
// create an unsupported vector op.
if ((VT.isVector() || LegalOperations) &&
!TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
return SDValue();
// Avoid infinite looping with PromoteIntBinOp.
// TODO: Should we apply desirable/legal constraints to all opcodes?
if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
!TLI.isTypeDesirableForOp(LogicOpcode, XVT))
return SDValue();
// logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
if (HandOpcode == ISD::TRUNCATE) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
return SDValue();
// We need matching source types.
if (XVT != Y.getValueType())
return SDValue();
// Don't create an illegal op during or after legalization.
if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
return SDValue();
// Be extra careful sinking truncate. If it's free, there's no benefit in
// widening a binop. Also, don't create a logic op on an illegal type.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
if (!TLI.isTypeLegal(XVT))
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
// For binops SHL/SRL/SRA/AND:
// logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
N0.getOperand(1) == N1.getOperand(1)) {
// If either operand has other uses, this transform is not an improvement.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
}
// Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
if (HandOpcode == ISD::BSWAP) {
// If either operand has other uses, this transform is not an improvement.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
// Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
// Only perform this optimization up until type legalization, before
// LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
// adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
// we don't want to undo this promotion.
// We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
// on scalars.
if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
Level <= AfterLegalizeTypes) {
// Input types must be integer and the same.
if (XVT.isInteger() && XVT == Y.getValueType() &&
!(VT.isVector() && TLI.isTypeLegal(VT) &&
!XVT.isVector() && !TLI.isTypeLegal(XVT))) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
}
// Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
// Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
// If both shuffles use the same mask, and both shuffle within a single
// vector, then it is worthwhile to move the swizzle after the operation.
// The type-legalizer generates this pattern when loading illegal
// vector types from memory. In many cases this allows additional shuffle
// optimizations.
// There are other cases where moving the shuffle after the xor/and/or
// is profitable even if shuffles don't perform a swizzle.
// If both shuffles use the same mask, and both shuffles have the same first
// or second operand, then it might still be profitable to move the shuffle
// after the xor/and/or operation.
if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
assert(X.getValueType() == Y.getValueType() &&
"Inputs to shuffles are not the same type");
// Check that both shuffles use the same mask. The masks are known to be of
// the same length because the result vector type is the same.
// Check also that shuffles have only one use to avoid introducing extra
// instructions.
if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
!SVN0->getMask().equals(SVN1->getMask()))
return SDValue();
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
SDValue ShOp = N0.getOperand(1);
if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
N0.getOperand(0), N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
}
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
ShOp = N0.getOperand(0);
if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
N1.getOperand(1));
return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
}
}
return SDValue();
}
/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
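/// For example, (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0).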
SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL) {
SDValue LL, LR, RL, RR, N0CC, N1CC;
if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
!isSetCCEquivalent(N1, RL, RR, N1CC))
return SDValue();
assert(N0.getValueType() == N1.getValueType() &&
"Unexpected operand types for bitwise logic op");
assert(LL.getValueType() == LR.getValueType() &&
RL.getValueType() == RR.getValueType() &&
"Unexpected operand types for setcc");
// If we're here post-legalization or the logic op type is not i1, the logic
// op type must match a setcc result type. Also, all folds require new
// operations on the left and right operands, so those types must match.
EVT VT = N0.getValueType();
EVT OpVT = LL.getValueType();
if (LegalOperations || VT.getScalarType() != MVT::i1)
if (VT != getSetCCResultType(OpVT))
return SDValue();
if (OpVT != RL.getValueType())
return SDValue();
ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
bool IsInteger = OpVT.isInteger();
if (LR == RR && CC0 == CC1 && IsInteger) {
bool IsZero = isNullOrNullSplat(LR);
bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
// All bits clear?
bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
// All sign bits clear?
bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
// Any bits set?
bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
// Any sign bits set?
bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
// (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
// (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
// (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
// (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
AddToWorklist(Or.getNode());
return DAG.getSetCC(DL, VT, Or, LR, CC1);
}
// All bits set?
bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
// All sign bits set?
bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
// Any bits clear?
bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
// Any sign bits clear?
bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
// (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
// (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
// (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
// (or (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
AddToWorklist(And.getNode());
return DAG.getSetCC(DL, VT, And, LR, CC1);
}
}
// TODO: What is the 'or' equivalent of this fold?
// (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
IsInteger && CC0 == ISD::SETNE &&
((isNullConstant(LR) && isAllOnesConstant(RR)) ||
(isAllOnesConstant(LR) && isNullConstant(RR)))) {
SDValue One = DAG.getConstant(1, DL, OpVT);
SDValue Two = DAG.getConstant(2, DL, OpVT);
SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
AddToWorklist(Add.getNode());
return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
}
// Try more general transforms if the predicates match and the only user of
// the compares is the 'and' or 'or'.
if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
N0.hasOneUse() && N1.hasOneUse()) {
// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
// or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
SDValue Zero = DAG.getConstant(0, DL, OpVT);
return DAG.getSetCC(DL, VT, Or, Zero, CC1);
}
// Turn compare of constants whose difference is 1 bit into add+and+setcc.
// TODO - support non-uniform vector amounts.
if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
// Match a shared variable operand and 2 non-opaque constant operands.
ConstantSDNode *C0 = isConstOrConstSplat(LR);
ConstantSDNode *C1 = isConstOrConstSplat(RR);
if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) {
// Canonicalize larger constant as C0.
if (C1->getAPIntValue().ugt(C0->getAPIntValue()))
std::swap(C0, C1);
// The difference of the constants must be a single bit.
const APInt &C0Val = C0->getAPIntValue();
const APInt &C1Val = C1->getAPIntValue();
if ((C0Val - C1Val).isPowerOf2()) {
// and/or (setcc X, C0, ne), (setcc X, C1, ne/eq) -->
// setcc ((add X, -C1), ~(C0 - C1)), 0, ne/eq
SDValue OffsetC = DAG.getConstant(-C1Val, DL, OpVT);
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LL, OffsetC);
SDValue MaskC = DAG.getConstant(~(C0Val - C1Val), DL, OpVT);
SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Add, MaskC);
SDValue Zero = DAG.getConstant(0, DL, OpVT);
return DAG.getSetCC(DL, VT, And, Zero, CC0);
}
}
}
}
// Canonicalize equivalent operands to LL == RL.
if (LL == RR && LR == RL) {
CC1 = ISD::getSetCCSwappedOperands(CC1);
std::swap(RL, RR);
}
// (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
// (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
if (LL == RL && LR == RR) {
ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
: ISD::getSetCCOrOperation(CC0, CC1, OpVT);
if (NewCC != ISD::SETCC_INVALID &&
(!LegalOperations ||
(TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
TLI.isOperationLegal(ISD::SETCC, OpVT))))
return DAG.getSetCC(DL, VT, LL, LR, NewCC);
}
return SDValue();
}
/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
SDLoc DL(N);
// fold (and x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
return V;
if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
VT.getSizeInBits() <= 64) {
if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
// Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
// immediate for an add, but it is legal if its top c2 bits are set,
// transform the ADD so the immediate doesn't need to be materialized
// in a register.
APInt ADDC = ADDI->getAPIntValue();
APInt SRLC = SRLI->getAPIntValue();
if (ADDC.getMinSignedBits() <= 64 &&
SRLC.ult(VT.getSizeInBits()) &&
!TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
SRLC.getZExtValue());
if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
ADDC |= Mask;
if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
SDLoc DL0(N0);
SDValue NewAdd =
DAG.getNode(ISD::ADD, DL0, VT,
N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
CombineTo(N0.getNode(), NewAdd);
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
}
}
}
}
}
// Reduce bit extract of low half of an integer to the narrower type.
// (and (srl i64:x, K), KMask) ->
// (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
unsigned Size = VT.getSizeInBits();
const APInt &AndMask = CAnd->getAPIntValue();
unsigned ShiftBits = CShift->getZExtValue();
// Bail out, this node will probably disappear anyway.
if (ShiftBits == 0)
return SDValue();
unsigned MaskBits = AndMask.countTrailingOnes();
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
if (AndMask.isMask() &&
// Required bits must not span the two halves of the integer and
// must fit in the half size type.
(ShiftBits + MaskBits <= Size / 2) &&
TLI.isNarrowingProfitable(VT, HalfVT) &&
TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
TLI.isTruncateFree(VT, HalfVT) &&
TLI.isZExtFree(HalfVT, VT)) {
// The isNarrowingProfitable check is to avoid regressions on PPC and
// AArch64 which match a few 64-bit bit insert / bit extract patterns
// on downstream users of this. Those patterns could probably be
// extended to handle extensions mixed in.
SDValue SL(N0);
assert(MaskBits <= Size);
// Extracting the highest bit of the low half.
EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
N0.getOperand(0));
SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
}
}
}
}
return SDValue();
}
bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT) {
if (!AndC->getAPIntValue().isMask())
return false;
unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
EVT LoadedVT = LoadN->getMemoryVT();
if (ExtVT == LoadedVT &&
(!LegalOperations ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
// ZEXTLOAD will match without needing to change the size of the value being
// loaded.
return true;
}
// Do not change the width of a volatile or atomic load.
if (!LoadN->isSimple())
return false;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
return false;
if (LegalOperations &&
!TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
return false;
if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
return false;
return true;
}
bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
ISD::LoadExtType ExtType, EVT &MemVT,
unsigned ShAmt) {
if (!LDST)
return false;
// Only allow byte offsets.
if (ShAmt % 8)
return false;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
if (!MemVT.isRound())
return false;
// Don't change the width of a volatile or atomic load.
if (!LDST->isSimple())
return false;
// Verify that we are actually reducing a load width here.
if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
return false;
// Ensure that this isn't going to produce an unsupported memory access.
if (ShAmt) {
assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
const unsigned ByteShAmt = ShAmt / 8;
const Align LDSTAlign = LDST->getAlign();
const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
LDST->getAddressSpace(), NarrowAlign,
LDST->getMemOperand()->getFlags()))
return false;
}
// It's not possible to generate a constant of extended or untyped type.
EVT PtrType = LDST->getBasePtr().getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
return false;
if (isa<LoadSDNode>(LDST)) {
LoadSDNode *Load = cast<LoadSDNode>(LDST);
// Don't transform one with multiple uses; this would require adding a new
// load.
if (!SDValue(Load, 0).hasOneUse())
return false;
if (LegalOperations &&
!TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
return false;
// For the transform to be legal, the load must produce only two values
// (the value loaded and the chain). Don't transform a pre-increment
// load, for example, which produces an extra value. Otherwise the
// transformation is not equivalent, and the downstream logic to replace
// uses gets things wrong.
if (Load->getNumValues() > 2)
return false;
// If the load that we're shrinking is an extload and we're not just
// discarding the extension we can't simply shrink the load. Bail.
// TODO: It would be possible to merge the extensions in some cases.
if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
return false;
if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
return false;
} else {
assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
StoreSDNode *Store = cast<StoreSDNode>(LDST);
// Can't write outside the original store
if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
return false;
if (LegalOperations &&
!TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
return false;
}
return true;
}
bool DAGCombiner::SearchForAndLoads(SDNode *N,
SmallVectorImpl<LoadSDNode*> &Loads,
SmallPtrSetImpl<SDNode*> &NodesWithConsts,
ConstantSDNode *Mask,
SDNode *&NodeToMask) {
// Recursively search for the operands, looking for loads which can be
// narrowed.
for (SDValue Op : N->op_values()) {
if (Op.getValueType().isVector())
return false;
// Some constants may need fixing up later if they are too large.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
(Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
NodesWithConsts.insert(N);
continue;
}
if (!Op.hasOneUse())
return false;
switch(Op.getOpcode()) {
case ISD::LOAD: {
auto *Load = cast<LoadSDNode>(Op);
EVT ExtVT;
if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
// ZEXTLOAD is already small enough.
if (Load->getExtensionType() == ISD::ZEXTLOAD &&
ExtVT.bitsGE(Load->getMemoryVT()))
continue;
// Use LE to convert equal sized loads to zext.
if (ExtVT.bitsLE(Load->getMemoryVT()))
Loads.push_back(Load);
continue;
}
return false;
}
case ISD::ZERO_EXTEND:
case ISD::AssertZext: {
unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
EVT VT = Op.getOpcode() == ISD::AssertZext ?
cast<VTSDNode>(Op.getOperand(1))->getVT() :
Op.getOperand(0).getValueType();
// We can accept extending nodes if the mask is wider than or equal in width
// to the original type.
if (ExtVT.bitsGE(VT))
continue;
break;
}
case ISD::OR:
case ISD::XOR:
case ISD::AND:
if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
NodeToMask))
return false;
continue;
}
// Allow one node which will be masked along with any loads found.
if (NodeToMask)
return false;
// Also ensure that the node to be masked only produces one data result.
NodeToMask = Op.getNode();
if (NodeToMask->getNumValues() > 1) {
bool HasValue = false;
for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
if (VT != MVT::Glue && VT != MVT::Other) {
if (HasValue) {
NodeToMask = nullptr;
return false;
}
HasValue = true;
}
}
assert(HasValue && "Node to be masked has no data result?");
}
}
return true;
}
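// For example, in (and (or (load i32 a), (load i32 b)), 255), the mask can be
// propagated back onto each load so that, assuming the narrower extending load
// is legal, each load becomes an i8 zextload and the outer AND is removed.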
bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!Mask)
return false;
if (!Mask->getAPIntValue().isMask())
return false;
// No need to do anything if the and directly uses a load.
if (isa<LoadSDNode>(N->getOperand(0)))
return false;
SmallVector<LoadSDNode*, 8> Loads;
SmallPtrSet<SDNode*, 2> NodesWithConsts;
SDNode *FixupNode = nullptr;
if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
if (Loads.size() == 0)
return false;
LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
SDValue MaskOp = N->getOperand(1);
// If it exists, fixup the single node we allow in the tree that needs
// masking.
if (FixupNode) {
LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
FixupNode->getValueType(0),
SDValue(FixupNode, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
if (And.getOpcode() == ISD::AND)
DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
}
// Narrow any constants that need it.
for (auto *LogicN : NodesWithConsts) {
SDValue Op0 = LogicN->getOperand(0);
SDValue Op1 = LogicN->getOperand(1);
if (isa<ConstantSDNode>(Op0))
std::swap(Op0, Op1);
SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
Op1, MaskOp);
DAG.UpdateNodeOperands(LogicN, Op0, And);
}
// Create narrow loads.
for (auto *Load : Loads) {
LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
SDValue(Load, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
if (And.getOpcode() == ISD::AND)
And = SDValue(
DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
SDValue NewLoad = ReduceLoadWidth(And.getNode());
assert(NewLoad &&
"Shouldn't be masking the load if it can't be narrowed");
CombineTo(Load, NewLoad, NewLoad.getValue(1));
}
DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
return true;
}
return false;
}
// Unfold
// x & (-1 'logical shift' y)
// To
// (x 'opposite logical shift' y) 'logical shift' y
// if it is better for performance.
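// e.g. (and x, (shl -1, y)) --> (shl (srl x, y), y)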
SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
assert(N->getOpcode() == ISD::AND);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Do we actually prefer shifts over mask?
if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
return SDValue();
// Try to match (-1 '[outer] logical shift' y)
unsigned OuterShift;
unsigned InnerShift; // The opposite direction to the OuterShift.
SDValue Y; // Shift amount.
auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
if (!M.hasOneUse())
return false;
OuterShift = M->getOpcode();
if (OuterShift == ISD::SHL)
InnerShift = ISD::SRL;
else if (OuterShift == ISD::SRL)
InnerShift = ISD::SHL;
else
return false;
if (!isAllOnesConstant(M->getOperand(0)))
return false;
Y = M->getOperand(1);
return true;
};
SDValue X;
if (matchMask(N1))
X = N0;
else if (matchMask(N0))
X = N1;
else
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// tmp = x 'opposite logical shift' y
SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
// ret = tmp 'logical shift' y
SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
return T1;
}
/// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
/// For a target with a bit test, this is expected to become test + set and save
/// at least 1 instruction.
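/// For example, (and (xor (srl X, 3), -1), 1) tests whether bit 3 of X is clear:
/// it becomes ((and X, 8) seteq 0), zero-extended to the original type.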
static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
// This is probably not worthwhile without a supported type.
EVT VT = And->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(VT))
return SDValue();
// Look through an optional extension and find a 'not'.
// TODO: Should we favor test+set even without the 'not' op?
SDValue Not = And->getOperand(0), And1 = And->getOperand(1);
if (Not.getOpcode() == ISD::ANY_EXTEND)
Not = Not.getOperand(0);
if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1))
return SDValue();
// Look through an optional truncation. The source operand may not be the same
// type as the original 'and', but that is ok because we are masking off
// everything but the low bit.
SDValue Srl = Not.getOperand(0);
if (Srl.getOpcode() == ISD::TRUNCATE)
Srl = Srl.getOperand(0);
// Match a shift-right by constant.
if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() ||
!isa<ConstantSDNode>(Srl.getOperand(1)))
return SDValue();
// We might have looked through casts that make this transform invalid.
// TODO: If the source type is wider than the result type, do the mask and
// compare in the source type.
const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1);
unsigned VTBitWidth = VT.getSizeInBits();
if (ShiftAmt.uge(VTBitWidth))
return SDValue();
// Turn this into a bit-test pattern using mask op + setcc:
// and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
SDLoc DL(And);
SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT);
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Mask = DAG.getConstant(
APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
return DAG.getZExtOrTrunc(Setcc, DL, VT);
}
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N1.getValueType();
// x & x --> x
if (N0 == N1)
return N0;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (and x, 0) -> 0, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
// do not return N0, because undef node may exist in N0
return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllZeros(N1.getNode()))
// do not return N1, because undef node may exist in N1
return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
SDLoc(N), N1.getValueType());
// fold (and x, -1) -> x, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllOnes(N1.getNode()))
return N0;
}
// fold (and c1, c2) -> c1&c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
// fold (and x, -1) -> x
if (isAllOnesConstant(N1))
return N0;
// if (and x, c) is known to be zero, return 0
unsigned BitWidth = VT.getScalarSizeInBits();
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(BitWidth)))
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate and
if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
return RAND;
// Try to convert a constant mask AND into a shuffle clear mask.
if (VT.isVector())
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
// fold (and (or x, C), D) -> D if (C & D) == D
auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
};
if (N0.getOpcode() == ISD::OR &&
ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
return N1;
// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N0Op0 = N0.getOperand(0);
APInt Mask = ~N1C->getAPIntValue();
Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
N0.getValueType(), N0Op0);
// Replace uses of the AND with uses of the Zero extend node.
CombineTo(N, Zext);
// We actually want to replace all uses of the any_extend with the
// zero_extend, to avoid duplicating things. This will later cause this
// AND to be folded.
CombineTo(N0.getNode(), Zext);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
// (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
// already be zero by virtue of the width of the base type of the load.
//
// the 'X' node here can either be nothing or an extract_vector_elt to catch
// more cases.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
N0.getOperand(0).getOpcode() == ISD::LOAD &&
N0.getOperand(0).getResNo() == 0) ||
(N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
N0 : N0.getOperand(0) );
// Get the constant (if applicable) the zero'th operand is being ANDed with.
// This can be a pure constant or a vector splat, in which case we treat the
// vector as a scalar and use the splat value.
APInt Constant = APInt::getNullValue(1);
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
Constant = C->getAPIntValue();
} else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
SplatBitSize, HasAnyUndefs);
if (IsSplat) {
// Undef bits can contribute to a possible optimisation if set, so
// set them.
SplatValue |= SplatUndef;
// The splat value may be something like "0x00FFFFFF", which means 0 for
// the first vector value and FF for the rest, repeating. We need a mask
// that will apply equally to all members of the vector, so AND all the
// lanes of the constant together.
unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
// If the splat value has been compressed to a bitlength lower
// than the size of the vector lane, we need to re-expand it to
// the lane size.
if (EltBitWidth > SplatBitSize)
for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth);
SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2)
SplatValue |= SplatValue.shl(SplatBitSize);
// Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
// multiple of 'EltBitWidth'. Otherwise, we could propagate a wrong value.
if ((SplatBitSize % EltBitWidth) == 0) {
Constant = APInt::getAllOnesValue(EltBitWidth);
for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
}
}
}
// If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
// actually legal and isn't going to get expanded, else this is a false
// optimisation.
bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
Load->getValueType(0),
Load->getMemoryVT());
// Resize the constant to the same size as the original memory access before
// extension. If it is still the AllOnesValue then this AND is completely
// unneeded.
Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
bool B;
switch (Load->getExtensionType()) {
default: B = false; break;
case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
case ISD::ZEXTLOAD:
case ISD::NON_EXTLOAD: B = true; break;
}
if (B && Constant.isAllOnesValue()) {
// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
// preserve semantics once we get rid of the AND.
SDValue NewLoad(Load, 0);
// Fold the AND away. NewLoad may get replaced immediately.
CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
if (Load->getExtensionType() == ISD::EXTLOAD) {
NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
Load->getValueType(0), SDLoc(Load),
Load->getChain(), Load->getBasePtr(),
Load->getOffset(), Load->getMemoryVT(),
Load->getMemOperand());
// Replace uses of the EXTLOAD with the new ZEXTLOAD.
if (Load->getNumValues() == 3) {
// PRE/POST_INC loads have 3 values.
SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
NewLoad.getValue(2) };
CombineTo(Load, To, 3, true);
} else {
CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
}
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// fold (and (load x), 255) -> (zextload x, i8)
// fold (and (extload x, i16), 255) -> (zextload x, i8)
// fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
(N0.getOpcode() == ISD::ANY_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::LOAD))) {
if (SDValue Res = ReduceLoadWidth(N)) {
LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
AddToWorklist(N);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res);
return SDValue(N, 0);
}
}
if (LegalTypes) {
// Attempt to propagate the AND back up to the leaves which, if they're
// loads, can be combined to narrow loads and the AND node can be removed.
// Perform after legalization so that extend nodes will already be
// combined into the loads.
if (BackwardsPropagateMask(N))
return SDValue(N, 0);
}
if (SDValue Combined = visitANDLike(N0, N1, N))
return Combined;
// Simplify: (and (op x...), (op y...)) -> (op (and x, y))
if (N0.getOpcode() == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// Masking the negated extension of a boolean is just the zero-extended
// boolean:
// and (sub 0, zext(bool X)), 1 --> zext(bool X)
// and (sub 0, sext(bool X)), 1 --> zext(bool X)
//
// Note: the SimplifyDemandedBits fold below can make an information-losing
// transform, and then we have no way to find this better fold.
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
if (isNullOrNullSplat(N0.getOperand(0))) {
SDValue SubRHS = N0.getOperand(1);
if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
return SubRHS;
if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
}
}
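// Worked example of the fold above (illustrative, i1 %b and i32 VT):
//   %b = 1: (sub 0, zext %b) = 0xFFFFFFFF, and with 1 -> 1 == zext %b
//           (sub 0, sext %b) = 0x00000001, and with 1 -> 1 == zext %b
//   %b = 0: both expressions evaluate to 0 == zext %b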
// fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
// fold (and (sra)) -> (and (srl)) when possible.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (zext_inreg (extload x)) -> (zextload x)
// fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
(ISD::isEXTLoad(N0.getNode()) ||
(ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT MemVT = LN0->getMemoryVT();
// If we zero all the possible extended bits, then we can turn this into
// a zextload if we are running before legalize or the operation is legal.
unsigned ExtBitSize = N1.getScalarValueSizeInBits();
unsigned MemBitSize = MemVT.getScalarSizeInBits();
APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
if (DAG.MaskedValueIsZero(N1, ExtBits) &&
((!LegalOperations && LN0->isSimple()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
SDValue ExtLoad =
DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
LN0->getBasePtr(), MemVT, LN0->getMemOperand());
AddToWorklist(N);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
N0.getOperand(1), false))
return BSwap;
}
if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
return Shifts;
if (TLI.hasBitTest(N0, N1))
if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
return V;
return SDValue();
}
/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits) {
if (!LegalOperations)
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
return SDValue();
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
// Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
bool LookPassAnd0 = false;
bool LookPassAnd1 = false;
if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
std::swap(N0, N1);
if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() == ISD::AND) {
if (!N0.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
// Also handle 0xffff since the LHS is guaranteed to have zeros there.
// This is needed for X86.
if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
N01C->getZExtValue() != 0xFFFF))
return SDValue();
N0 = N0.getOperand(0);
LookPassAnd0 = true;
}
if (N1.getOpcode() == ISD::AND) {
if (!N1.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C || N11C->getZExtValue() != 0xFF)
return SDValue();
N1 = N1.getOperand(0);
LookPassAnd1 = true;
}
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N01C || !N11C)
return SDValue();
if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
return SDValue();
// Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
SDValue N00 = N0->getOperand(0);
if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
if (!N00.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
if (!N001C || N001C->getZExtValue() != 0xFF)
return SDValue();
N00 = N00.getOperand(0);
LookPassAnd0 = true;
}
SDValue N10 = N1->getOperand(0);
if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
if (!N10.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
// Also allow 0xFFFF since the bits will be shifted out. This is needed
// for X86.
if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
N101C->getZExtValue() != 0xFFFF))
return SDValue();
N10 = N10.getOperand(0);
LookPassAnd1 = true;
}
if (N00 != N10)
return SDValue();
// Make sure everything beyond the low halfword gets set to zero since the SRL
// 16 will clear the top bits.
unsigned OpSizeInBits = VT.getSizeInBits();
if (DemandHighBits && OpSizeInBits > 16) {
// If the left-shift isn't masked out then the only way this is a bswap is
// if all bits beyond the low 8 are 0. In that case the entire pattern
// reduces to a left shift anyway: leave it for other parts of the combiner.
if (!LookPassAnd0)
return SDValue();
// However, if the right shift isn't masked out then it might be because
// it's not needed. See if we can spot that too.
if (!LookPassAnd1 &&
!DAG.MaskedValueIsZero(
N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
return SDValue();
}
SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
if (OpSizeInBits > 16) {
SDLoc DL(N);
Res = DAG.getNode(ISD::SRL, DL, VT, Res,
DAG.getConstant(OpSizeInBits - 16, DL,
getShiftAmountTy(VT)));
}
return Res;
}
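// Worked example for MatchBSwapHWordLow (illustrative, i32 a = 0xAABBCCDD):
//   (and (shl a, 8), 0xFF00) = 0x0000DD00
//   (and (srl a, 8), 0x00FF) = 0x000000CC
//   or                       = 0x0000DDCC
//   (srl (bswap a), 16)      = (srl 0xDDCCBBAA, 16) = 0x0000DDCC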
/// Return true if the specified node is an element that makes up a 32-bit
/// packed halfword byteswap.
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
if (!N.getNode()->hasOneUse())
return false;
unsigned Opc = N.getOpcode();
if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
return false;
SDValue N0 = N.getOperand(0);
unsigned Opc0 = N0.getOpcode();
if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
return false;
ConstantSDNode *N1C = nullptr;
// SHL or SRL: look upstream for AND mask operand
if (Opc == ISD::AND)
N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
else if (Opc0 == ISD::AND)
N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N1C)
return false;
unsigned MaskByteOffset;
switch (N1C->getZExtValue()) {
default:
return false;
case 0xFF: MaskByteOffset = 0; break;
case 0xFF00: MaskByteOffset = 1; break;
case 0xFFFF:
// In case demanded bits didn't clear the bits that will be shifted out.
// This is needed for X86.
if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
MaskByteOffset = 1;
break;
}
return false;
case 0xFF0000: MaskByteOffset = 2; break;
case 0xFF000000: MaskByteOffset = 3; break;
}
// Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
if (Opc == ISD::AND) {
if (MaskByteOffset == 0 || MaskByteOffset == 2) {
// (x >> 8) & 0xff
// (x >> 8) & 0xff0000
if (Opc0 != ISD::SRL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
} else {
// (x << 8) & 0xff00
// (x << 8) & 0xff000000
if (Opc0 != ISD::SHL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
}
} else if (Opc == ISD::SHL) {
// (x & 0xff) << 8
// (x & 0xff0000) << 8
if (MaskByteOffset != 0 && MaskByteOffset != 2)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
} else { // Opc == ISD::SRL
// (x & 0xff00) >> 8
// (x & 0xff000000) >> 8
if (MaskByteOffset != 1 && MaskByteOffset != 3)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
}
if (Parts[MaskByteOffset])
return false;
Parts[MaskByteOffset] = N0.getOperand(0).getNode();
return true;
}
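// Illustrative examples of how elements map into Parts (hypothetical i32 x,
// all nodes assumed single-use):
//   (and (srl x, 8), 0xFF)       -> Parts[0] = x
//   (shl (and x, 0xFF), 8)       -> Parts[0] = x
//   (and (shl x, 8), 0xFF00)     -> Parts[1] = x
//   (shl (and x, 0xFF0000), 8)   -> Parts[2] = x
//   (srl (and x, 0xFF000000), 8) -> Parts[3] = x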
// Match 2 elements of a packed halfword bswap.
static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
if (N.getOpcode() == ISD::OR)
return isBSwapHWordElement(N.getOperand(0), Parts) &&
isBSwapHWordElement(N.getOperand(1), Parts);
if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
if (!C || C->getAPIntValue() != 16)
return false;
Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
return true;
}
return false;
}
// Match this pattern:
// (or (and (shl A, 8), 0xff00ff00), (and (srl A, 8), 0x00ff00ff))
// And rewrite this to:
// (rotr (bswap A), 16)
static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
SelectionDAG &DAG, SDNode *N, SDValue N0,
SDValue N1, EVT VT, EVT ShiftAmountTy) {
assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
"MatchBSwapHWordOrAndAnd: expecting i32");
if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
return SDValue();
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
// TODO: this is too restrictive; lifting this restriction requires more tests
if (!N0->hasOneUse() || !N1->hasOneUse())
return SDValue();
ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
if (!Mask0 || !Mask1)
return SDValue();
if (Mask0->getAPIntValue() != 0xff00ff00 ||
Mask1->getAPIntValue() != 0x00ff00ff)
return SDValue();
SDValue Shift0 = N0.getOperand(0);
SDValue Shift1 = N1.getOperand(0);
if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
return SDValue();
ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
if (!ShiftAmt0 || !ShiftAmt1)
return SDValue();
if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
return SDValue();
if (Shift0.getOperand(0) != Shift1.getOperand(0))
return SDValue();
SDLoc DL(N);
SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
}
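// Worked example for matchBSwapHWordOrAndAnd (illustrative, i32 A = 0x11223344):
//   (and (shl A, 8), 0xFF00FF00) = 0x22004400
//   (and (srl A, 8), 0x00FF00FF) = 0x00110033
//   or                           = 0x22114433
//   (rotr (bswap A), 16)         = (rotr 0x44332211, 16) = 0x22114433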
/// Match a 32-bit packed halfword bswap. That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
/// => (rotl (bswap x), 16)
SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
if (!LegalOperations)
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
return SDValue();
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
getShiftAmountTy(VT)))
return BSwap;
// Try again with commuted operands.
if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
getShiftAmountTy(VT)))
return BSwap;
// Look for either
// (or (bswaphpair), (bswaphpair))
// (or (or (bswaphpair), (and)), (and))
// (or (or (and), (bswaphpair)), (and))
SDNode *Parts[4] = {};
if (isBSwapHWordPair(N0, Parts)) {
// (or (or (and), (and)), (or (and), (and)))
if (!isBSwapHWordPair(N1, Parts))
return SDValue();
} else if (N0.getOpcode() == ISD::OR) {
// (or (or (or (and), (and)), (and)), (and))
if (!isBSwapHWordElement(N1, Parts))
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
!(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
return SDValue();
} else
return SDValue();
// Make sure the parts are all coming from the same node.
if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
return SDValue();
SDLoc DL(N);
SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
SDValue(Parts[0], 0));
// Result of the bswap should be rotated by 16. If it's not legal, then
// do (x << 16) | (x >> 16).
SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
return DAG.getNode(ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
}
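// Worked example of the four-element form (illustrative, i32 x = 0x11223344):
//   ((x & 0x000000FF) << 8) = 0x00004400
//   ((x & 0x0000FF00) >> 8) = 0x00000033
//   ((x & 0x00FF0000) << 8) = 0x22000000
//   ((x & 0xFF000000) >> 8) = 0x00110000
//   or of all four          = 0x22114433 == (rotl (bswap x), 16)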
/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
SDLoc DL(N);
// fold (or x, undef) -> -1
if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
return DAG.getAllOnesConstant(DL, VT);
if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
return V;
// (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
// Don't increase # computations.
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
// We can only do this xform if we know that bits from X that are set in C2
// but not in C1 are already zero. Likewise for Y.
if (const ConstantSDNode *N0O1C =
getAsNonOpaqueConstant(N0.getOperand(1))) {
if (const ConstantSDNode *N1O1C =
getAsNonOpaqueConstant(N1.getOperand(1))) {
const APInt &LHSMask = N0O1C->getAPIntValue();
const APInt &RHSMask = N1O1C->getAPIntValue();
if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(0), N1.getOperand(0));
return DAG.getNode(ISD::AND, DL, VT, X,
DAG.getConstant(LHSMask | RHSMask, DL, VT));
}
}
}
}
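// Illustrative example of the fold above (hypothetical known-bits):
//   (or (and X, 0x0000FFFF), (and Y, 0xFFFF0000))
// If the high 16 bits of X and the low 16 bits of Y are known zero, this
// becomes (and (or X, Y), 0xFFFFFFFF); the all-ones AND then folds away,
// leaving just (or X, Y).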
// (or (and X, M), (and X, N)) -> (and X, (or M, N))
if (N0.getOpcode() == ISD::AND &&
N1.getOpcode() == ISD::AND &&
N0.getOperand(0) == N1.getOperand(0) &&
// Don't increase # computations.
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(1), N1.getOperand(1));
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
}
return SDValue();
}
/// OR combines for which the commuted variant will be tried as well.
static SDValue visitORCommutative(
SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N0.getValueType();
if (N0.getOpcode() == ISD::AND) {
// fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1)
return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);
// fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1)
return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
}
return SDValue();
}
SDValue DAGCombiner::visitOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N1.getValueType();
// x | x --> x
if (N0 == N1)
return N0;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (or x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
// fold (or x, -1) -> -1, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
// do not return N0, because undef node may exist in N0
return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllOnes(N1.getNode()))
// do not return N1, because undef node may exist in N1
return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
// Do this only if the resulting shuffle is legal.
if (isa<ShuffleVectorSDNode>(N0) &&
isa<ShuffleVectorSDNode>(N1) &&
// Avoid folding a node with illegal type.
TLI.isTypeLegal(VT)) {
bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
// Ensure both shuffles have a zero input.
if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
bool CanFold = true;
int NumElts = VT.getVectorNumElements();
SmallVector<int, 4> Mask(NumElts);
for (int i = 0; i != NumElts; ++i) {
int M0 = SV0->getMaskElt(i);
int M1 = SV1->getMaskElt(i);
// Determine if either index is pointing to a zero vector.
bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
// If one element is zero and the other side is undef, keep undef.
// This also handles the case that both are undef.
if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
Mask[i] = -1;
continue;
}
// Make sure only one of the elements is zero.
if (M0Zero == M1Zero) {
CanFold = false;
break;
}
assert((M0 >= 0 || M1 >= 0) && "Undef index!");
// We have a zero and non-zero element. If the non-zero came from
// SV0 make the index a LHS index. If it came from SV1, make it
// a RHS index. We need to mod by NumElts because we don't care
// which operand it came from in the original shuffles.
Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
}
if (CanFold) {
SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
Mask, DAG);
if (LegalShuffle)
return LegalShuffle;
}
}
}
}
// fold (or c1, c2) -> c1|c2
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
// fold (or x, 0) -> x
if (isNullConstant(N1))
return N0;
// fold (or x, -1) -> -1
if (isAllOnesConstant(N1))
return N1;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (or x, c) -> c iff (x & ~c) == 0
if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
return N1;
if (SDValue Combined = visitORLike(N0, N1, N))
return Combined;
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
// Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
return BSwap;
if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
return BSwap;
// reassociate or
if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
return ROR;
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
// iff (c1 & c2) != 0 or c1/c2 are undef.
auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
};
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
{N1, N0.getOperand(1)})) {
SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
AddToWorklist(IOR.getNode());
return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
}
}
if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
return Combined;
if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
return Combined;
// Simplify: (or (op x...), (op y...)) -> (op (or x, y))
if (N0.getOpcode() == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// See if this is some rotate idiom.
if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
return Rot;
if (SDValue Load = MatchLoadCombine(N))
return Load;
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// If OR can be rewritten into ADD, try combines based on ADD.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
DAG.haveNoCommonBitsSet(N0, N1))
if (SDValue Combined = visitADDLike(N))
return Combined;
return SDValue();
}
static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
if (Op.getOpcode() == ISD::AND &&
DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
Mask = Op.getOperand(1);
return Op.getOperand(0);
}
return Op;
}
/// Match "(X shl/srl V1) & V2" where V2 may not be present.
static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
SDValue &Mask) {
Op = stripConstantMask(DAG, Op, Mask);
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
Shift = Op;
return true;
}
return false;
}
/// Helper function for visitOR to extract the needed side of a rotate idiom
/// from a shl/srl/mul/udiv. This is meant to handle cases where
/// InstCombine merged some outside op with one of the shifts from
/// the rotate pattern.
/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
/// Otherwise, returns an expansion of \p ExtractFrom based on the following
/// patterns:
///
/// (or (add v v) (shrl v bitwidth-1)):
/// expands (add v v) -> (shl v 1)
///
/// (or (mul v c0) (shrl (mul v c1) c2)):
/// expands (mul v c0) -> (shl (mul v c1) c3)
///
/// (or (udiv v c0) (shl (udiv v c1) c2)):
/// expands (udiv v c0) -> (shrl (udiv v c1) c3)
///
/// (or (shl v c0) (shrl (shl v c1) c2)):
/// expands (shl v c0) -> (shl (shl v c1) c3)
///
/// (or (shrl v c0) (shl (shrl v c1) c2)):
/// expands (shrl v c0) -> (shrl (shrl v c1) c3)
///
/// Such that in all cases, c3+c2==bitwidth(op v c1).
static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
SDValue ExtractFrom, SDValue &Mask,
const SDLoc &DL) {
assert(OppShift && ExtractFrom && "Empty SDValue");
assert(
(OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
"Existing shift must be valid as a rotate half");
ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
// Value and Type of the shift.
SDValue OppShiftLHS = OppShift.getOperand(0);
EVT ShiftedVT = OppShiftLHS.getValueType();
// Amount of the existing shift.
ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
// (add v v) -> (shl v 1)
// TODO: Should this be a general DAG canonicalization?
if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
ExtractFrom.getOpcode() == ISD::ADD &&
ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
ExtractFrom.getOperand(0) == OppShiftLHS &&
OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
DAG.getShiftAmountConstant(1, ShiftedVT, DL));
// Preconditions:
// (or (op0 v c0) (shiftl/r (op0 v c1) c2))
//
// Find opcode of the needed shift to be extracted from (op0 v c0).
unsigned Opcode = ISD::DELETED_NODE;
bool IsMulOrDiv = false;
// Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
// opcode or its arithmetic (mul or udiv) variant.
auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
return false;
Opcode = NeededShift;
return true;
};
// op0 must be either the needed shift opcode or the mul/udiv equivalent
// that the needed shift can be extracted from.
if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
(OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
return SDValue();
// op0 must be the same opcode on both sides, have the same LHS argument,
// and produce the same value type.
if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
ShiftedVT != ExtractFrom.getValueType())
return SDValue();
// Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
// Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
ConstantSDNode *ExtractFromCst =
isConstOrConstSplat(ExtractFrom.getOperand(1));
// TODO: We should be able to handle non-uniform constant vectors for these values
// Check that we have constant values.
if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
!OppLHSCst || !OppLHSCst->getAPIntValue() ||
!ExtractFromCst || !ExtractFromCst->getAPIntValue())
return SDValue();
// Compute the shift amount we need to extract to complete the rotate.
const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
if (OppShiftCst->getAPIntValue().ugt(VTWidth))
return SDValue();
APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
// Normalize the bitwidth of the two mul/udiv/shift constant operands.
APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
APInt OppLHSAmt = OppLHSCst->getAPIntValue();
zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
// Now try extract the needed shift from the ExtractFrom op and see if the
// result matches up with the existing shift's LHS op.
if (IsMulOrDiv) {
// Op to extract from is a mul or udiv by a constant.
// Check:
// c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
// c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
NeededShiftAmt.getZExtValue());
APInt ResultAmt;
APInt Rem;
APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
if (Rem != 0 || ResultAmt != OppLHSAmt)
return SDValue();
} else {
// Op to extract from is a shift by a constant.
// Check:
// c2 - (bitwidth(op0 v c0) - c1) == c0
if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
ExtractFromAmt.getBitWidth()))
return SDValue();
}
// Return the expanded shift op that should allow a rotate to be formed.
EVT ShiftVT = OppShift.getOperand(1).getValueType();
EVT ResVT = ExtractFrom.getValueType();
SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
}
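// Worked example of the mul case for extractShiftForRotate (illustrative,
// hypothetical i32 v):
//   (or (mul v, 1536), (srl (mul v, 3), 23))
// Here c2 == 23, so the needed shift amount is 32 - 23 == 9, and
// 1536 / (1 << 9) == 3 with no remainder, so (mul v, 1536) is rewritten as
// (shl (mul v, 3), 9), which MatchRotate can then turn into a rotate.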
// Return true if we can prove that, whenever Neg and Pos are both in the
// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
// for two opposing shifts shift1 and shift2 and a value X with EltSize bits:
//
// (or (shift1 X, Neg), (shift2 X, Pos))
//
// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
// in direction shift1 by Neg. The range [0, EltSize) means that we only need
// to consider shift amounts with defined behavior.
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
SelectionDAG &DAG) {
// If EltSize is a power of 2 then:
//
// (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
// (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
//
// So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
// for the stronger condition:
//
// Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
//
// for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
// we can just replace Neg with Neg' for the rest of the function.
//
// In other cases we check for the even stronger condition:
//
// Neg == EltSize - Pos [B]
//
// for all Neg and Pos. Note that the (or ...) then invokes undefined
// behavior if Pos == 0 (and consequently Neg == EltSize).
//
// We could actually use [A] whenever EltSize is a power of 2, but the
// only extra cases that it would match are those uninteresting ones
// where Neg and Pos are never in range at the same time. E.g. for
// EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
// as well as (sub 32, Pos), but:
//
// (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
//
// always invokes undefined behavior for 32-bit X.
//
// Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
unsigned MaskLoBits = 0;
if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0));
unsigned Bits = Log2_64(EltSize);
if (NegC->getAPIntValue().getActiveBits() <= Bits &&
((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
Neg = Neg.getOperand(0);
MaskLoBits = Bits;
}
}
}
// Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
if (Neg.getOpcode() != ISD::SUB)
return false;
ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
if (!NegC)
return false;
SDValue NegOp1 = Neg.getOperand(1);
// On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
// Pos'. The truncation is redundant for the purpose of the equality.
if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0));
if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
MaskLoBits))
Pos = Pos.getOperand(0);
}
}
// The condition we need is now:
//
// (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
//
// If NegOp1 == Pos then we need:
//
// EltSize & Mask == NegC & Mask
//
// (because "x & Mask" is a truncation and distributes through subtraction).
//
// We also need to account for a potential truncation of NegOp1 if the amount
// has already been legalized to a shift amount type.
APInt Width;
if ((Pos == NegOp1) ||
(NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
Width = NegC->getAPIntValue();
// Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
// Then the condition we want to prove becomes:
//
// (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
//
// which, again because "x & Mask" is a truncation, becomes:
//
// NegC & Mask == (EltSize - PosC) & Mask
// EltSize & Mask == (NegC + PosC) & Mask
else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
Width = PosC->getAPIntValue() + NegC->getAPIntValue();
else
return false;
} else
return false;
// Now we just need to check that EltSize & Mask == Width & Mask.
if (MaskLoBits)
// EltSize & Mask is 0 since Mask is EltSize - 1.
return Width.getLoBits(MaskLoBits) == 0;
return Width == EltSize;
}
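// Illustrative examples for matchRotateSub with EltSize == 32 (hypothetical
// shift amount y):
//   Pos = y,           Neg = (sub 32, y)           -> matches via rule [B]
//   Pos = (and y, 31), Neg = (and (sub 0, y), 31)  -> matches via rule [A]
// In the second form NegC == 0 and the low 5 bits of (0 - y) equal
// (32 - y) & 31, which is exactly what the masked comparison checks.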
// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
// former being preferred if supported. InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
// fold (or (shl x, (*ext y)),
// (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
//
// fold (or (shl x, (*ext (sub 32, y))),
// (srl x, (*ext y))) ->
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
HasPos ? Pos : Neg);
}
return SDValue();
}
// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
// former being preferred if supported. InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
// TODO: Merge with MatchRotatePosNeg.
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
// fold (or (shl x0, (*ext y)),
// (srl x1, (*ext (sub 32, y)))) ->
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
//
// fold (or (shl x0, (*ext (sub 32, y))),
// (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
HasPos ? Pos : Neg);
}
// Matching the shift+xor cases, we can't easily use the xor'd shift amount
// so for now just use the PosOpcode case if its legal.
// TODO: When can we use the NegOpcode case?
if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
if (Op.getOpcode() != BinOpc)
return false;
ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
return Cst && (Cst->getAPIntValue() == Imm);
};
// fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
// -> (fshl x0, x1, y)
if (IsBinOpImm(N1, ISD::SRL, 1) &&
IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
InnerPos == InnerNeg.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
}
// fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
if (IsBinOpImm(N0, ISD::SHL, 1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
InnerNeg == InnerPos.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
// fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
// TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
InnerNeg == InnerPos.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
}
return SDValue();
}
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
// with different shifted sources.
SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// Must be a legal type. Expanded 'n promoted things won't work with rotates.
EVT VT = LHS.getValueType();
if (!TLI.isTypeLegal(VT))
return SDValue();
// The target must have at least one rotate/funnel flavor.
bool HasROTL = hasOperation(ISD::ROTL, VT);
bool HasROTR = hasOperation(ISD::ROTR, VT);
bool HasFSHL = hasOperation(ISD::FSHL, VT);
bool HasFSHR = hasOperation(ISD::FSHR, VT);
if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
return SDValue();
// Check for truncated rotate.
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
assert(LHS.getValueType() == RHS.getValueType());
if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
}
}
// Match "(X shl/srl V1) & V2" where V2 may not be present.
SDValue LHSShift; // The shift.
SDValue LHSMask; // AND value if any.
matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
SDValue RHSShift; // The shift.
SDValue RHSMask; // AND value if any.
matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
// If neither side matched a rotate half, bail
if (!LHSShift && !RHSShift)
return SDValue();
// InstCombine may have combined a constant shl, srl, mul, or udiv with one
// side of the rotate, so try to handle that here. In all cases we need to
// pass the matched shift from the opposite side to compute the opcode and
// needed shift amount to extract. We still want to do this if both sides
// matched a rotate half because one half may be a potential overshift that
// can be broken down (ie if InstCombine merged two shl or srl ops into a
// single one).
// Have LHS side of the rotate, try to extract the needed shift from the RHS.
if (LHSShift)
if (SDValue NewRHSShift =
extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
RHSShift = NewRHSShift;
// Have RHS side of the rotate, try to extract the needed shift from the LHS.
if (RHSShift)
if (SDValue NewLHSShift =
extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
LHSShift = NewLHSShift;
// If a side is still missing, nothing else we can do.
if (!RHSShift || !LHSShift)
return SDValue();
// At this point we've matched or extracted a shift op on each side.
if (LHSShift.getOpcode() == RHSShift.getOpcode())
return SDValue(); // Shifts must disagree.
bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
if (!IsRotate && !(HasFSHL || HasFSHR))
return SDValue(); // Requires funnel shift support.
// Canonicalize shl to left side in a shl/srl pair.
if (RHSShift.getOpcode() == ISD::SHL) {
std::swap(LHS, RHS);
std::swap(LHSShift, RHSShift);
std::swap(LHSMask, RHSMask);
}
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue LHSShiftArg = LHSShift.getOperand(0);
SDValue LHSShiftAmt = LHSShift.getOperand(1);
SDValue RHSShiftArg = RHSShift.getOperand(0);
SDValue RHSShiftAmt = RHSShift.getOperand(1);
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
// fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
// fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
// iff C1+C2 == EltSizeInBits
auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
};
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
if (IsRotate && (HasROTL || HasROTR))
Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
HasROTL ? LHSShiftAmt : RHSShiftAmt);
else
Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt);
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
SDValue Mask = AllOnes;
if (LHSMask.getNode()) {
SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
}
if (RHSMask.getNode()) {
SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
}
Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
}
return Res;
}
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
return SDValue();
// If the shift amount is sign/zext/any-extended just peel it off.
SDValue LExtOp0 = LHSShiftAmt;
SDValue RExtOp0 = RHSShiftAmt;
if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
(RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
LExtOp0 = LHSShiftAmt.getOperand(0);
RExtOp0 = RHSShiftAmt.getOperand(0);
}
if (IsRotate && (HasROTL || HasROTR)) {
SDValue TryL =
MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
RExtOp0, ISD::ROTL, ISD::ROTR, DL);
if (TryL)
return TryL;
SDValue TryR =
MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
LExtOp0, ISD::ROTR, ISD::ROTL, DL);
if (TryR)
return TryR;
}
SDValue TryL =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
if (TryL)
return TryL;
SDValue TryR =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
if (TryR)
return TryR;
return SDValue();
}
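// Minimal examples of the constant-amount case handled above (illustrative):
//   (or (shl x, 5), (srl x, 27)) on i32, 5 + 27 == 32 -> (rotl x, 5)
//   (or (shl x, 5), (srl y, 27)) on i32               -> (fshl x, y, 5)
// subject to the target reporting the rotate/funnel opcode as available.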
namespace {
/// Represents the known origin of an individual byte in a load combine
/// pattern. The value of the byte is either constant zero or comes from memory.
struct ByteProvider {
// For constant zero providers Load is set to nullptr. For memory providers
// Load represents the node which loads the byte from memory.
// ByteOffset is the offset of the byte in the value produced by the load.
LoadSDNode *Load = nullptr;
unsigned ByteOffset = 0;
ByteProvider() = default;
static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
return ByteProvider(Load, ByteOffset);
}
static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
bool isConstantZero() const { return !Load; }
bool isMemory() const { return Load; }
bool operator==(const ByteProvider &Other) const {
return Other.Load == Load && Other.ByteOffset == ByteOffset;
}
private:
ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
: Load(Load), ByteOffset(ByteOffset) {}
};
} // end anonymous namespace
/// Recursively traverses the expression calculating the origin of the requested
/// byte of the given value. Returns None if the provider can't be calculated.
///
/// For every value except the root of the expression, this verifies that the
/// value has exactly one use; if that is not the case it returns None. This
/// way, if the origin of the byte is returned, it is guaranteed that the values
/// which contribute to the byte are not used outside of this expression.
///
/// Because the parts of the expression are not allowed to have more than one
/// use, this function iterates over trees, not DAGs, so it never visits the
/// same node more than once.
static const Optional<ByteProvider>
calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
bool Root = false) {
// Typical i64 by i8 pattern requires recursion up to 8 calls depth
if (Depth == 10)
return None;
if (!Root && !Op.hasOneUse())
return None;
assert(Op.getValueType().isScalarInteger() && "can't handle other types");
unsigned BitWidth = Op.getValueSizeInBits();
if (BitWidth % 8 != 0)
return None;
unsigned ByteWidth = BitWidth / 8;
assert(Index < ByteWidth && "invalid index requested");
(void) ByteWidth;
switch (Op.getOpcode()) {
case ISD::OR: {
auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
if (!LHS)
return None;
auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
if (!RHS)
return None;
if (LHS->isConstantZero())
return RHS;
if (RHS->isConstantZero())
return LHS;
return None;
}
case ISD::SHL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
return None;
uint64_t BitShift = ShiftOp->getZExtValue();
if (BitShift % 8 != 0)
return None;
uint64_t ByteShift = BitShift / 8;
return Index < ByteShift
? ByteProvider::getConstantZero()
: calculateByteProvider(Op->getOperand(0), Index - ByteShift,
Depth + 1);
}
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
SDValue NarrowOp = Op->getOperand(0);
unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
if (NarrowBitWidth % 8 != 0)
return None;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
if (Index >= NarrowByteWidth)
return Op.getOpcode() == ISD::ZERO_EXTEND
? Optional<ByteProvider>(ByteProvider::getConstantZero())
: None;
return calculateByteProvider(NarrowOp, Index, Depth + 1);
}
case ISD::BSWAP:
return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
Depth + 1);
case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
if (!L->isSimple() || L->isIndexed())
return None;
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)
return None;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
if (Index >= NarrowByteWidth)
return L->getExtensionType() == ISD::ZEXTLOAD
? Optional<ByteProvider>(ByteProvider::getConstantZero())
: None;
return ByteProvider::getMemory(L, Index);
}
}
return None;
}
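// Illustrative example of calculateByteProvider (hypothetical i8 loads from
// %p and %q, i32 result):
//   t = (or (zero_extend (load i8 %p)), (shl (zero_extend (load i8 %q)), 8))
//   byte 0 of t -> memory(load %p, offset 0)
//   byte 1 of t -> memory(load %q, offset 0)
//   bytes 2..3  -> constant zero (from the zero_extends)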
static unsigned LittleEndianByteAt(unsigned BW, unsigned i) {
return i;
}
static unsigned BigEndianByteAt(unsigned BW, unsigned i) {
return BW - i - 1;
}
// Check if the byte offsets we are looking at match either a big or a little
// endian value load. Return true for big endian, false for little endian,
// and None if the match failed.
static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
int64_t FirstOffset) {
// The endian can be decided only when it is 2 bytes at least.
unsigned Width = ByteOffsets.size();
if (Width < 2)
return None;
bool BigEndian = true, LittleEndian = true;
for (unsigned i = 0; i < Width; i++) {
int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i);
BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i);
if (!BigEndian && !LittleEndian)
return None;
}
assert((BigEndian != LittleEndian) && "It should be either big endian or "
"little endian");
return BigEndian;
}
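// Illustrative examples with Width == 4 and FirstOffset == 0:
//   ByteOffsets {0, 1, 2, 3} -> returns false (little endian order)
//   ByteOffsets {3, 2, 1, 0} -> returns true  (big endian order)
//   ByteOffsets {0, 2, 1, 3} -> returns None  (neither order)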
static SDValue stripTruncAndExt(SDValue Value) {
switch (Value.getOpcode()) {
case ISD::TRUNCATE:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND:
return stripTruncAndExt(Value.getOperand(0));
}
return Value;
}
/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 0) & 0xFF;
/// p[1] = (val >> 8) & 0xFF;
/// p[2] = (val >> 16) & 0xFF;
/// p[3] = (val >> 24) & 0xFF;
/// =>
/// *((i32)p) = val;
///
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 24) & 0xFF;
/// p[1] = (val >> 16) & 0xFF;
/// p[2] = (val >> 8) & 0xFF;
/// p[3] = (val >> 0) & 0xFF;
/// =>
/// *((i32)p) = BSWAP(val);
SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
// Collect all the stores in the chain.
SDValue Chain;
SmallVector<StoreSDNode *, 8> Stores;
for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
// TODO: Allow unordered atomics when wider type is legal (see D66309)
if (Store->getMemoryVT() != MVT::i8 ||
!Store->isSimple() || Store->isIndexed())
return SDValue();
Stores.push_back(Store);
Chain = Store->getChain();
}
// Handle the simple type only.
unsigned Width = Stores.size();
EVT VT = EVT::getIntegerVT(
*DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
return SDValue();
// Check if all the bytes of the combined value we are looking at are stored
// to the same base address. Collect byte offsets from the Base address into
// ByteOffsets.
SDValue CombinedValue;
SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);
int64_t FirstOffset = INT64_MAX;
StoreSDNode *FirstStore = nullptr;
Optional<BaseIndexOffset> Base;
for (auto Store : Stores) {
// All the stores store a different byte of the CombinedValue. A truncate is
// required to get that byte value.
SDValue Trunc = Store->getValue();
if (Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
// A shift operation is required to get the right byte offset, except the
// first byte.
int64_t Offset = 0;
SDValue Value = Trunc.getOperand(0);
if (Value.getOpcode() == ISD::SRL ||
Value.getOpcode() == ISD::SRA) {
auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1));
// Trying to match the following pattern. The shift offset must be
// a constant and a multiple of 8. It is the byte offset in "y".
//
// x = srl y, offset
// i8 z = trunc x
// store z, ...
if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
return SDValue();
Offset = ShiftOffset->getSExtValue()/8;
Value = Value.getOperand(0);
}
// Stores must share the same combined value with different offsets.
if (!CombinedValue)
CombinedValue = Value;
else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
return SDValue();
// The trunc and all the extend operations should be stripped to get the
// real value being stored.
else if (CombinedValue.getValueType() != VT) {
if (Value.getValueType() == VT ||
Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
CombinedValue = Value;
// Give up if the combined value type is smaller than the store size.
if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
return SDValue();
}
// Stores must share the same base address
BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
int64_t ByteOffsetFromBase = 0;
if (!Base)
Base = Ptr;
else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
return SDValue();
// Remember the first byte store
if (ByteOffsetFromBase < FirstOffset) {
FirstStore = Store;
FirstOffset = ByteOffsetFromBase;
}
// Map the offset in the store and the offset in the combined value, and
// early return if it has been set before.
if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
return SDValue();
ByteOffsets[Offset] = ByteOffsetFromBase;
}
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
assert(FirstStore && "First store must be set");
// Check if the bytes of the combined value we are looking at match with
// either big or little endian value store.
Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
if (!IsBigEndian.hasValue())
return SDValue();
// The node we are looking at matches with the pattern, check if we can
// replace it with a single bswap (if needed) and a store.
// If the store needs a byte swap, check if the target supports it.
bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;
// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// store and byte shuffling instead of several stores and byte shuffling.
if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();
// Check that a store of the wide type is both allowed and fast on the target
bool Fast = false;
bool Allowed =
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstStore->getMemOperand(), &Fast);
if (!Allowed || !Fast)
return SDValue();
if (VT != CombinedValue.getValueType()) {
assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
"Get unexpected store value to combine");
CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
CombinedValue);
}
if (NeedsBswap)
CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);
SDValue NewStore =
DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
FirstStore->getPointerInfo(), FirstStore->getAlignment());
// Rely on other DAG combine rules to remove the other individual stores.
DAG.ReplaceAllUsesWith(N, NewStore.getNode());
return NewStore;
}
/// Match a pattern where a wide type scalar value is loaded by several narrow
/// loads and combined by shifts and ors. Fold it into a single load or a load
/// and a BSWAP if the targets supports it.
///
/// Assuming little endian target:
/// i8 *a = ...
/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
/// =>
/// i32 val = *((i32)a)
///
/// i8 *a = ...
/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
/// =>
/// i32 val = BSWAP(*((i32)a))
///
/// TODO: This rule matches complex patterns with OR node roots and doesn't
/// interact well with the worklist mechanism. When a part of the pattern is
/// updated (e.g. one of the loads) its direct users are put into the worklist,
/// but the root node of the pattern which triggers the load combine is not
/// necessarily a direct user of the changed node. For example, once the address
/// of t28 load is reassociated load combine won't be triggered:
/// t25: i32 = add t4, Constant:i32<2>
/// t26: i64 = sign_extend t25
/// t27: i64 = add t2, t26
/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
/// t29: i32 = zero_extend t28
/// t32: i32 = shl t29, Constant:i8<8>
/// t33: i32 = or t23, t32
/// As a possible fix visitLoad can check if the load can be a part of a load
/// combine pattern and add corresponding OR roots to the worklist.
SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
assert(N->getOpcode() == ISD::OR &&
"Can only match load combining against OR nodes");
// Handles simple types only
EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
unsigned ByteWidth = VT.getSizeInBits() / 8;
bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
auto MemoryByteOffset = [&] (ByteProvider P) {
assert(P.isMemory() && "Must be a memory byte provider");
unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
assert(LoadBitWidth % 8 == 0 &&
"can only analyze providers for individual bytes not bit");
unsigned LoadByteWidth = LoadBitWidth / 8;
return IsBigEndianTarget
? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
: LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
};
Optional<BaseIndexOffset> Base;
SDValue Chain;
SmallPtrSet<LoadSDNode *, 8> Loads;
Optional<ByteProvider> FirstByteProvider;
int64_t FirstOffset = INT64_MAX;
// Check if all the bytes of the OR we are looking at are loaded from the same
// base address. Collect byte offsets from the Base address in ByteOffsets.
SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
unsigned ZeroExtendedBytes = 0;
for (int i = ByteWidth - 1; i >= 0; --i) {
auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
if (!P)
return SDValue();
if (P->isConstantZero()) {
// It's OK for the N most significant bytes to be 0; we can just
// zero-extend the load.
if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
return SDValue();
continue;
}
assert(P->isMemory() && "provenance should either be memory or zero");
LoadSDNode *L = P->Load;
assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
!L->isIndexed() &&
"Must be enforced by calculateByteProvider");
assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
// All loads must share the same chain
SDValue LChain = L->getChain();
if (!Chain)
Chain = LChain;
else if (Chain != LChain)
return SDValue();
// Loads must share the same base address
BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
int64_t ByteOffsetFromBase = 0;
if (!Base)
Base = Ptr;
else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
return SDValue();
// Calculate the offset of the current byte from the base address
ByteOffsetFromBase += MemoryByteOffset(*P);
ByteOffsets[i] = ByteOffsetFromBase;
// Remember the first byte load
if (ByteOffsetFromBase < FirstOffset) {
FirstByteProvider = P;
FirstOffset = ByteOffsetFromBase;
}
Loads.insert(L);
}
assert(!Loads.empty() && "All the bytes of the value must be loaded from "
"memory, so there must be at least one load which produces the value");
assert(Base && "Base address of the accessed memory location must be set");
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
bool NeedsZext = ZeroExtendedBytes > 0;
EVT MemVT =
EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);
if (!MemVT.isSimple())
return SDValue();
// Before legalization we can introduce too-wide illegal loads which will later
// be split into legal-sized loads. This enables us to combine i64-by-i8 load
// patterns into a couple of i32 loads on 32-bit targets.
if (LegalOperations &&
!TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
MemVT))
return SDValue();
// Check if the bytes of the OR we are looking at match with either big or
// little endian value load
Optional<bool> IsBigEndian = isBigEndian(
makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
if (!IsBigEndian.hasValue())
return SDValue();
assert(FirstByteProvider && "must be set");
// Ensure that the first byte is loaded from zero offset of the first load.
// So the combined value can be loaded from the first load address.
if (MemoryByteOffset(*FirstByteProvider) != 0)
return SDValue();
LoadSDNode *FirstLoad = FirstByteProvider->Load;
// The node we are looking at matches the pattern; check if we can replace it
// with a single (possibly zero-extended) load and a bswap + shift if needed.
// If the load needs a byte swap, check if the target supports it.
bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// load and byte shuffling instead of several loads and byte shuffling.
// We do not introduce illegal bswaps when zero-extending as this tends to
// introduce too many arithmetic instructions.
if (NeedsBswap && (LegalOperations || NeedsZext) &&
!TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();
// If we need to bswap and zero extend, we have to insert a shift. Check that
// it is legal.
if (NeedsBswap && NeedsZext && LegalOperations &&
!TLI.isOperationLegal(ISD::SHL, VT))
return SDValue();
// Check that a load of the wide type is both allowed and fast on the target
bool Fast = false;
bool Allowed =
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
*FirstLoad->getMemOperand(), &Fast);
if (!Allowed || !Fast)
return SDValue();
SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
SDLoc(N), VT, Chain, FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), MemVT,
FirstLoad->getAlignment());
// Transfer chain users from old loads to the new load.
for (LoadSDNode *L : Loads)
DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
if (!NeedsBswap)
return NewLoad;
SDValue ShiftedLoad =
NeedsZext
? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
SDLoc(N), LegalOperations))
: NewLoad;
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
}
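// Illustrative example of the zero-extend case handled above (hypothetical
// values, little-endian target):
//   i8 *a = ...
//   i64 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
// The four most significant bytes are provided by constant zero, so
// ZeroExtendedBytes == 4 and MemVT == i32, and the whole pattern becomes a
// single zero-extending i32 load (ISD::ZEXTLOAD to i64) when that access is
// legal and fast for the target.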
// If the target has andn, bsl, or a similar bit-select instruction,
// we want to unfold masked merge, with canonical pattern of:
// | A | |B|
// ((x ^ y) & m) ^ y
// | D |
// Into:
// (x & m) | (y & ~m)
// If y is a constant, and the 'andn' does not work with immediates,
// we unfold into a different pattern:
// ~(~x & m) & (m | y)
// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
// the very least that breaks andnpd / andnps patterns, and because those
// patterns are simplified in IR and shouldn't be created in the DAG
SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
assert(N->getOpcode() == ISD::XOR);
// Don't touch 'not' (i.e. where y = -1).
if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
return SDValue();
EVT VT = N->getValueType(0);
// There are 3 commutable operators in the pattern,
// so we have to deal with 8 possible variants of the basic pattern.
SDValue X, Y, M;
auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
if (And.getOpcode() != ISD::AND || !And.hasOneUse())
return false;
SDValue Xor = And.getOperand(XorIdx);
if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
return false;
SDValue Xor0 = Xor.getOperand(0);
SDValue Xor1 = Xor.getOperand(1);
// Don't touch 'not' (i.e. where y = -1).
if (isAllOnesOrAllOnesSplat(Xor1))
return false;
if (Other == Xor0)
std::swap(Xor0, Xor1);
if (Other != Xor1)
return false;
X = Xor0;
Y = Xor1;
M = And.getOperand(XorIdx ? 0 : 1);
return true;
};
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
!matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
return SDValue();
// Don't do anything if the mask is constant. This should not be reachable.
// InstCombine should have already unfolded this pattern, and DAGCombiner
// probably shouldn't produce it either.
if (isa<ConstantSDNode>(M.getNode()))
return SDValue();
// We can transform if the target has AndNot
if (!TLI.hasAndNot(M))
return SDValue();
SDLoc DL(N);
// If Y is a constant, check that 'andn' works with immediates.
if (!TLI.hasAndNot(Y)) {
assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
// If not, we need to do a bit more work to make sure andn is still used.
SDValue NotX = DAG.getNOT(DL, X, VT);
SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
}
SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
SDValue NotM = DAG.getNOT(DL, M, VT);
SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
}
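// Worked example with hypothetical 4-bit values (illustration only):
//   x = 0b1010, y = 0b0110, m = 0b1100
//   ((x ^ y) & m) ^ y  = (0b1100 & 0b1100) ^ 0b0110 = 0b1010
//   (x & m) | (y & ~m) =  0b1000 | 0b0010           = 0b1010
// Both forms select the bits of x where m is set and the bits of y where m is
// clear, which is what a single andn/bsl-style bit-select computes.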
SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (xor x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
}
// fold (xor undef, undef) -> 0. This is a common idiom (misuse).
SDLoc DL(N);
if (N0.isUndef() && N1.isUndef())
return DAG.getConstant(0, DL, VT);
// fold (xor x, undef) -> undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
// fold (xor c1, c2) -> c1^c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
// fold (xor x, 0) -> x
if (isNullConstant(N1))
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate xor
if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
return RXOR;
// fold !(x cc y) -> (x !cc y)
unsigned N0Opcode = N0.getOpcode();
SDValue LHS, RHS, CC;
if (TLI.isConstTrueVal(N1.getNode()) &&
isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {
ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
LHS.getValueType());
if (!LegalOperations ||
TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
switch (N0Opcode) {
default:
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
case ISD::SELECT_CC:
return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
N0.getOperand(3), NotCC);
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: {
if (N0.hasOneUse()) {
// FIXME Can we handle multiple uses? Could we token factor the chain
// results from the new/old setcc?
SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
N0.getOperand(0),
N0Opcode == ISD::STRICT_FSETCCS);
CombineTo(N, SetCC);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
recursivelyDeleteUnusedNodes(N0.getNode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
break;
}
}
}
}
// fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
SDValue V = N0.getOperand(0);
SDLoc DL0(N0);
V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
DAG.getConstant(1, DL0, V.getValueType()));
AddToWorklist(V.getNode());
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
return DAG.getNode(NewOpcode, DL, VT, N00, N01);
}
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
if (isAllOnesConstant(N1) && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
return DAG.getNode(NewOpcode, DL, VT, N00, N01);
}
}
// fold (not (neg x)) -> (add X, -1)
// FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
// Y is a constant or the subtract has a single use.
if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
isNullConstant(N0.getOperand(0))) {
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
DAG.getAllOnesConstant(DL, VT));
}
// fold (not (add X, -1)) -> (neg X)
if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
N0.getOperand(0));
}
// fold (xor (and x, y), y) -> (and (not x), y)
if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
SDValue X = N0.getOperand(0);
SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
AddToWorklist(NotX.getNode());
return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
}
if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
ConstantSDNode *XorC = isConstOrConstSplat(N1);
ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
unsigned BitWidth = VT.getScalarSizeInBits();
if (XorC && ShiftC) {
// Don't crash on an oversized shift. We cannot guarantee that a bogus
// shift has been simplified to undef.
uint64_t ShiftAmt = ShiftC->getLimitedValue();
if (ShiftAmt < BitWidth) {
APInt Ones = APInt::getAllOnesValue(BitWidth);
Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
if (XorC->getAPIntValue() == Ones) {
// If the xor constant is a shifted -1, do a 'not' before the shift:
// xor (X << ShiftC), XorC --> (not X) << ShiftC
// xor (X >> ShiftC), XorC --> (not X) >> ShiftC
SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
}
}
}
}
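// Illustrative instance of the fold above (hypothetical i8 values):
//   xor (X << 4), 0xF0 --> (not X) << 4
// since 0xF0 is exactly -1 shifted left by 4; the low four zero bits produced
// by the shift are untouched either way, and the high four bits are inverted.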
// fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
SDValue S0 = S.getOperand(0);
if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
unsigned OpSizeInBits = VT.getScalarSizeInBits();
if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
if (C->getAPIntValue() == (OpSizeInBits - 1))
return DAG.getNode(ISD::ABS, DL, VT, S0);
}
}
}
// fold (xor x, x) -> 0
if (N0 == N1)
return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// fold (xor (shl 1, x), -1) -> (rotl ~1, x)
// Here is a concrete example of this equivalence:
// i16 x == 14
// i16 shl == 1 << 14 == 16384 == 0b0100000000000000
// i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
//
// =>
//
// i16 ~1 == 0b1111111111111110
// i16 rol(~1, 14) == 0b1011111111111111
//
// Some additional tips to help conceptualize this transform:
// - Try to see the operation as placing a single zero in a value of all ones.
// - There exists no value for x which would allow the result to contain zero.
// - Values of x larger than the bitwidth are undefined and do not require a
// consistent result.
// - Pushing the zero left requires shifting one-bits in from the right.
// A rotate left of ~1 is a nice way of achieving the desired result.
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
N0.getOperand(1));
}
// Simplify: xor (op x...), (op y...) -> (op (xor x, y))
if (N0Opcode == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
if (SDValue MM = unfoldMaskedMerge(N))
return MM;
// Simplify the expression using non-local knowledge.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
return SDValue();
}
/// If we have a shift-by-constant of a bitwise logic op that itself has a
/// shift-by-constant operand with identical opcode, we may be able to convert
/// that into 2 independent shifts followed by the logic op. This is a
/// throughput improvement.
static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
// Match a one-use bitwise logic op.
SDValue LogicOp = Shift->getOperand(0);
if (!LogicOp.hasOneUse())
return SDValue();
unsigned LogicOpcode = LogicOp.getOpcode();
if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
LogicOpcode != ISD::XOR)
return SDValue();
// Find a matching one-use shift by constant.
unsigned ShiftOpcode = Shift->getOpcode();
SDValue C1 = Shift->getOperand(1);
ConstantSDNode *C1Node = isConstOrConstSplat(C1);
assert(C1Node && "Expected a shift with constant operand");
const APInt &C1Val = C1Node->getAPIntValue();
auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
const APInt *&ShiftAmtVal) {
if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
return false;
ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
if (!ShiftCNode)
return false;
// Capture the shifted operand and shift amount value.
ShiftOp = V.getOperand(0);
ShiftAmtVal = &ShiftCNode->getAPIntValue();
// Shift amount types do not have to match their operand type, so check that
// the constants are the same width.
if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
return false;
// The fold is not valid if the sum of the shift values exceeds bitwidth.
if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
return false;
return true;
};
// Logic ops are commutative, so check each operand for a match.
SDValue X, Y;
const APInt *C0Val;
if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
Y = LogicOp.getOperand(1);
else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
Y = LogicOp.getOperand(0);
else
return SDValue();
// shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
SDLoc DL(Shift);
EVT VT = Shift->getValueType(0);
EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
}
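// Illustrative instance (hypothetical i32 constants):
//   shl (and (shl X, 2), Y), 3 --> and (shl X, 5), (shl Y, 3)
// One shift of the AND is rewritten as two independent shifts whose results
// feed the AND, so both shifts can execute in parallel.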
/// Handle transforms common to the three shifts, when the shift amount is a
/// constant.
/// We are looking for: (shift being one of shl/sra/srl)
/// shift (binop X, C0), C1
/// And want to transform into:
/// binop (shift X, C1), (shift C0, C1)
SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
// Do not turn a 'not' into a regular xor.
if (isBitwiseNot(N->getOperand(0)))
return SDValue();
// The inner binop must be one-use, since we want to replace it.
SDValue LHS = N->getOperand(0);
if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
return SDValue();
// TODO: This is limited to early combining because it may reveal regressions
// otherwise. But since we just checked a target hook to see if this is
// desirable, that should have filtered out cases where this interferes
// with some other pattern matching.
if (!LegalTypes)
if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
return R;
// We want to pull some binops through shifts, so that we have (and (shift))
// instead of (shift (and)), likewise for add, or, xor, etc. This sort of
// thing happens with address calculations, so it's important to canonicalize
// it.
switch (LHS.getOpcode()) {
default:
return SDValue();
case ISD::OR:
case ISD::XOR:
case ISD::AND:
break;
case ISD::ADD:
if (N->getOpcode() != ISD::SHL)
return SDValue(); // only shl(add) not sr[al](add).
break;
}
// We require the RHS of the binop to be a constant and not opaque as well.
ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
if (!BinOpCst)
return SDValue();
// FIXME: disable this unless the input to the binop is a shift by a constant
// or is a copy/select. Enable this in other cases when we figure out it's
// exactly profitable.
SDValue BinOpLHSVal = LHS.getOperand(0);
bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
BinOpLHSVal.getOpcode() == ISD::SRA ||
BinOpLHSVal.getOpcode() == ISD::SRL) &&
isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
BinOpLHSVal.getOpcode() == ISD::SELECT;
if (!IsShiftByConstant && !IsCopyOrSelect)
return SDValue();
if (IsCopyOrSelect && N->hasOneUse())
return SDValue();
// Fold the constants, shifting the binop RHS by the shift amount.
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
N->getOperand(1));
assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
N->getOperand(1));
return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
}
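// Illustrative instance (hypothetical i32 constants, assuming the target
// hook and the shift-by-constant/copy/select restriction above allow it):
//   shl (xor X, 0xF0), 4 --> xor (shl X, 4), 0xF00
// The constant operand is folded at compile time (0xF0 << 4 == 0xF00).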
SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
assert(N->getOpcode() == ISD::TRUNCATE);
assert(N->getOperand(0).getOpcode() == ISD::AND);
// (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
EVT TruncVT = N->getValueType(0);
if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
SDValue N01 = N->getOperand(0).getOperand(1);
if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
SDLoc DL(N);
SDValue N00 = N->getOperand(0).getOperand(0);
SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
AddToWorklist(Trunc00.getNode());
AddToWorklist(Trunc01.getNode());
return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
}
}
return SDValue();
}
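// Illustrative instance (hypothetical types/constants): truncation distributes
// over an AND with a constant mask, e.g.
//   (truncate:i32 (and X:i64, 0xFF)) --> (and (truncate:i32 X), 0xFF)
// when both nodes are single-use and AND is desirable on the narrow type.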
SDValue DAGCombiner::visitRotate(SDNode *N) {
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned Bitsize = VT.getScalarSizeInBits();
// fold (rot x, 0) -> x
if (isNullOrNullSplat(N1))
return N0;
// fold (rot x, c) -> x iff (c % BitSize) == 0
if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
if (DAG.MaskedValueIsZero(N1, ModuloMask))
return N0;
}
// fold (rot x, c) -> (rot x, c % BitSize)
bool OutOfRange = false;
auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
OutOfRange |= C->getAPIntValue().uge(Bitsize);
return true;
};
if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
EVT AmtVT = N1.getValueType();
SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
if (SDValue Amt =
DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
}
// rot i16 X, 8 --> bswap X
auto *RotAmtC = isConstOrConstSplat(N1);
if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
return DAG.getNode(ISD::BSWAP, dl, VT, N0);
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
}
unsigned NextOp = N0.getOpcode();
// fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
EVT ShiftVT = C1->getValueType(0);
bool SameSide = (N->getOpcode() == NextOp);
unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {
SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});
return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
CombinedShiftNorm);
}
}
}
return SDValue();
}
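// Illustrative instances of the rotate-of-rotate fold above (i32, hypothetical
// constants):
//   rotl (rotl X, 3), 5 --> rotl X, 8    (same direction: amounts add)
//   rotl (rotr X, 3), 5 --> rotl X, 2    (opposite direction: amounts subtract)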
SDValue DAGCombiner::visitSHL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
EVT VT = N0.getValueType();
EVT ShiftVT = N1.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
// If setcc produces all-one true value then:
// (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
if (N1CV && N1CV->isConstant()) {
if (N0.getOpcode() == ISD::AND) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
}
}
}
}
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// if (shl x, c) is known to be zero, return 0
if (DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
return DAG.getConstant(0, SDLoc(N), VT);
// fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
}
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
if (N0.getOpcode() == ISD::SHL) {
auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
}
}
// fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
// For this to be valid, the second form must not preserve any of the bits
// that are shifted out by the inner shift in the first form. This means
// the outer shift size must be >= the number of bits added by the ext.
// As a corollary, we don't care what kind of ext it is.
if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND) &&
N0.getOperand(0).getOpcode() == ISD::SHL) {
SDValue N0Op0 = N0.getOperand(0);
SDValue InnerShiftAmt = N0Op0.getOperand(1);
EVT InnerVT = N0Op0.getValueType();
uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();
auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return c2.uge(OpSizeInBits - InnerBitwidth) &&
(c1 + c2).uge(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return c2.uge(OpSizeInBits - InnerBitwidth) &&
(c1 + c2).ult(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true)) {
SDLoc DL(N);
SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
}
}
// fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
// Only fold this if the inner zext has no other uses to avoid increasing
// the total number of instructions.
if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::SRL) {
SDValue N0Op0 = N0.getOperand(0);
SDValue InnerShiftAmt = N0Op0.getOperand(1);
auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2);
return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
};
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true)) {
SDLoc DL(N);
EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
AddToWorklist(NewSHL.getNode());
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
}
}
// fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
// TODO - support non-uniform vector shift amounts.
if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
N0->getFlags().hasExact()) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
uint64_t C1 = N0C1->getZExtValue();
uint64_t C2 = N1C->getZExtValue();
SDLoc DL(N);
if (C1 <= C2)
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
DAG.getConstant(C2 - C1, DL, ShiftVT));
return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
DAG.getConstant(C1 - C2, DL, ShiftVT));
}
}
// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) or
//                               (and (srl x, (sub c1, c2)), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding
// this will increase the total number of instructions.
// TODO - drop hasOneUse requirement if c1 == c2?
// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
if (N0C1->getAPIntValue().ult(OpSizeInBits)) {
uint64_t c1 = N0C1->getZExtValue();
uint64_t c2 = N1C->getZExtValue();
APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
SDValue Shift;
if (c2 > c1) {
Mask <<= c2 - c1;
SDLoc DL(N);
Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
DAG.getConstant(c2 - c1, DL, ShiftVT));
} else {
Mask.lshrInPlace(c1 - c2);
SDLoc DL(N);
Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
DAG.getConstant(c1 - c2, DL, ShiftVT));
}
SDLoc DL(N0);
return DAG.getNode(ISD::AND, DL, VT, Shift,
DAG.getConstant(Mask, DL, VT));
}
}
}
// fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {
SDLoc DL(N);
SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
}
// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Variant of version done on multiply, except mul by a power of 2 is turned
// into a shift.
if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
TLI.isDesirableToCommuteWithShift(N, Level)) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
AddToWorklist(Shl0.getNode());
AddToWorklist(Shl1.getNode());
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
}
// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
if (isConstantOrConstantVector(Shl))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
}
if (N1C && !N1C->isOpaque())
if (SDValue NewSHL = visitShiftByConstant(N))
return NewSHL;
// Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
if (N0.getOpcode() == ISD::VSCALE)
if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
auto DL = SDLoc(N);
APInt C0 = N0.getConstantOperandAPInt(0);
APInt C1 = NC1->getAPIntValue();
return DAG.getVScale(DL, VT, C0 << C1);
}
return SDValue();
}
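// Illustrative instance of the shl-of-add fold above (hypothetical i32
// constants, assuming the target finds commuting with the shift desirable):
//   shl (add X, 3), 2 --> add (shl X, 2), 12
// i.e. (X + 3) * 4 == X * 4 + 12, which matches how address arithmetic is
// usually folded.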
// Transform a right shift of a multiply into a multiply-high.
// Examples:
// (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
// (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"SRL or SRA node is required here!");
// Check the shift amount. Proceed with the transformation if the shift
// amount is constant.
ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
if (!ShiftAmtSrc)
return SDValue();
SDLoc DL(N);
// The operation feeding into the shift must be a multiply.
SDValue ShiftOperand = N->getOperand(0);
if (ShiftOperand.getOpcode() != ISD::MUL)
return SDValue();
// Both operands must be equivalent extend nodes.
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode())
return SDValue();
EVT WideVT1 = LeftOp.getValueType();
EVT WideVT2 = RightOp.getValueType();
(void)WideVT2;
// Proceed with the transformation if the wide types match.
assert((WideVT1 == WideVT2) &&
"Cannot have a multiply node with two different operand types.");
EVT NarrowVT = LeftOp.getOperand(0).getValueType();
// Check that the two extend nodes are the same type.
if (NarrowVT != RightOp.getOperand(0).getValueType())
return SDValue();
// Only transform into mulh if mulh for the narrow type is cheaper than
// a multiply followed by a shift. This should also check if mulh is
// legal for NarrowVT on the target.
if (!TLI.isMulhCheaperThanMulShift(NarrowVT))
return SDValue();
// Proceed with the transformation if the wide type is twice as large
// as the narrow type.
unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize)
return SDValue();
// Check the shift amount with the narrow type size.
// Proceed with the transformation if the shift amount is the width
// of the narrow type.
unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
if (ShiftAmt != NarrowVTSize)
return SDValue();
// If the operation feeding into the MUL is a sign extend (sext),
// we use mulhs. Otherwise, zero extends (zext) use mulhu.
unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0),
RightOp.getOperand(0));
return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1)
: DAG.getZExtOrTrunc(Result, DL, WideVT1));
}
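// Illustrative instance (hypothetical i32/i64 types, assuming
// isMulhCheaperThanMulShift(i32) returns true for the target):
//   sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32
//     --> sext (mulhs i32 $a, $b) to i64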
SDValue DAGCombiner::visitSRA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// Arithmetic shifting an all-sign-bit value is a no-op.
// fold (sra 0, x) -> 0
// fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (sra c1, c2) -> c1 >>s c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 if the target supports
// sext_inreg.
if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
if (VT.isVector())
ExtVT = EVT::getVectorVT(*DAG.getContext(),
ExtVT, VT.getVectorNumElements());
if (!LegalOperations ||
TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
TargetLowering::Legal)
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
N0.getOperand(0), DAG.getValueType(ExtVT));
}
// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
// clamp (add c1, c2) to max shift.
if (N0.getOpcode() == ISD::SRA) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
EVT ShiftSVT = ShiftVT.getScalarType();
SmallVector<SDValue, 16> ShiftValues;
auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
APInt Sum = c1 + c2;
unsigned ShiftSum =
Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
return true;
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
SDValue ShiftValue;
if (VT.isVector())
ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
else
ShiftValue = ShiftValues[0];
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
}
}
// fold (sra (shl X, m), (sub result_size, n))
// -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
// result_size - n != m.
// If truncate is free for the target, sext(shl) is likely to result in
// better code.
if (N0.getOpcode() == ISD::SHL && N1C) {
// Get the two constants of the shifts, CN0 = m, CN = n.
const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
if (N01C) {
LLVMContext &Ctx = *DAG.getContext();
// Determine what the truncate's result bitsize and type would be.
EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
if (VT.isVector())
TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
// Determine the residual right-shift amount.
int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
// If the shift is not a no-op (in which case this should be just a sign
// extend already), the truncate-to type is legal, sign_extend is legal
// on that type, and the truncate to that type is both legal and free,
// perform the transform.
if ((ShiftAmt > 0) &&
TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
TLI.isTruncateFree(VT, TruncVT)) {
SDLoc DL(N);
SDValue Amt = DAG.getConstant(ShiftAmt, DL,
getShiftAmountTy(N0.getOperand(0).getValueType()));
SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
N0.getOperand(0), Amt);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
Shift);
return DAG.getNode(ISD::SIGN_EXTEND, DL,
N->getValueType(0), Trunc);
}
}
}
// We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
// sra (add (shl X, N1C), AddC), N1C -->
// sext (add (trunc X to (width - N1C)), AddC')
if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
N0.getOperand(0).getOpcode() == ISD::SHL &&
N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
SDValue Shl = N0.getOperand(0);
// Determine what the truncate's type would be and ask the target if that
// is a free operation.
LLVMContext &Ctx = *DAG.getContext();
unsigned ShiftAmt = N1C->getZExtValue();
EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
if (VT.isVector())
TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
// TODO: The simple type check probably belongs in the default hook
// implementation and/or target-specific overrides (because
// non-simple types likely require masking when legalized), but that
// restriction may conflict with other transforms.
if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
TLI.isTruncateFree(VT, TruncVT)) {
SDLoc DL(N);
SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT);
SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
return DAG.getSExtOrTrunc(Add, DL, VT);
}
}
}
// fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
}
// fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
// fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
// if c1 is equal to the number of bits the trunc removes
// TODO - support non-uniform vector shift amounts.
if (N0.getOpcode() == ISD::TRUNCATE &&
(N0.getOperand(0).getOpcode() == ISD::SRL ||
N0.getOperand(0).getOpcode() == ISD::SRA) &&
N0.getOperand(0).hasOneUse() &&
N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
SDValue N0Op0 = N0.getOperand(0);
if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
EVT LargeVT = N0Op0.getValueType();
unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
if (LargeShift->getAPIntValue() == TruncBits) {
SDLoc DL(N);
SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL,
getShiftAmountTy(LargeVT));
SDValue SRA =
DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
}
}
}
// Simplify, based on bits shifted out of the LHS.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// If the sign bit is known to be zero, switch this to a SRL.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
if (N1C && !N1C->isOpaque())
if (SDValue NewSRA = visitShiftByConstant(N))
return NewSRA;
// Try to transform this shift into a multiply-high if
// it matches the appropriate pattern detected in combineShiftToMULH.
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;
return SDValue();
}
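// Illustrative instance of the sra(shl) -> sext_inreg fold above (i32,
// hypothetical constant):
//   sra (shl X, 24), 24 --> sign_extend_inreg X, i8
// assuming SIGN_EXTEND_INREG with an i8 value type is legal (or we are before
// operation legalization).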
SDValue DAGCombiner::visitSRL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (srl c1, c2) -> c1 >>u c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// if (srl x, c) is known to be zero, return 0
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
return DAG.getConstant(0, SDLoc(N), VT);
// fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
if (N0.getOpcode() == ISD::SRL) {
auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
}
}
if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
N0.getOperand(0).getOpcode() == ISD::SRL) {
SDValue InnerShift = N0.getOperand(0);
// TODO - support non-uniform vector shift amounts.
if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
uint64_t c1 = N001C->getZExtValue();
uint64_t c2 = N1C->getZExtValue();
EVT InnerShiftVT = InnerShift.getValueType();
EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
// srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
// This is only valid if the OpSizeInBits + c1 = size of inner shift.
if (c1 + OpSizeInBits == InnerShiftSize) {
SDLoc DL(N);
if (c1 + c2 >= InnerShiftSize)
return DAG.getConstant(0, DL, VT);
SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
InnerShift.getOperand(0), NewShiftAmt);
return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
}
// In the more general case, we can clear the high bits after the shift:
// srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
if (N0.hasOneUse() && InnerShift.hasOneUse() &&
c1 + c2 < InnerShiftSize) {
SDLoc DL(N);
SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
InnerShift.getOperand(0), NewShiftAmt);
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
OpSizeInBits - c2),
DL, InnerShiftVT);
SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
}
}
}
// fold (srl (shl x, c), c) -> (and x, cst2)
// TODO - (srl (shl x, c1), c2).
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
SDValue Mask =
DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
AddToWorklist(Mask.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
}
// fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
// Shifting in all undef bits?
EVT SmallVT = N0.getOperand(0).getValueType();
unsigned BitSize = SmallVT.getScalarSizeInBits();
if (N1C->getAPIntValue().uge(BitSize))
return DAG.getUNDEF(VT);
if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
uint64_t ShiftAmt = N1C->getZExtValue();
SDLoc DL0(N0);
SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
N0.getOperand(0),
DAG.getConstant(ShiftAmt, DL0,
getShiftAmountTy(SmallVT)));
AddToWorklist(SmallShift.getNode());
APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
DAG.getConstant(Mask, DL, VT));
}
}
// fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
// bit, which is unmodified by sra.
if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
if (N0.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
}
// fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
if (N1C && N0.getOpcode() == ISD::CTLZ &&
N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));
// If any of the input bits are KnownOne, then the input couldn't be all
// zeros, thus the result of the srl will always be zero.
if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
// If all of the bits input to the ctlz node are known to be zero, then
// the result of the ctlz is "32" and the result of the shift is one.
APInt UnknownBits = ~Known.Zero;
if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
// Otherwise, check to see if there is exactly one bit input to the ctlz.
if (UnknownBits.isPowerOf2()) {
// Okay, we know that only the single bit specified by UnknownBits
// could be set on input to the CTLZ node. If this bit is set, the SRL
// will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
// to an SRL/XOR pair, which is likely to simplify more.
unsigned ShAmt = UnknownBits.countTrailingZeros();
SDValue Op = N0.getOperand(0);
if (ShAmt) {
SDLoc DL(N0);
Op = DAG.getNode(ISD::SRL, DL, VT, Op,
DAG.getConstant(ShAmt, DL,
getShiftAmountTy(Op.getValueType())));
AddToWorklist(Op.getNode());
}
SDLoc DL(N);
return DAG.getNode(ISD::XOR, DL, VT,
Op, DAG.getConstant(1, DL, VT));
}
}
// fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
}
// fold operands of srl based on knowledge that the low bits are not
// demanded.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (N1C && !N1C->isOpaque())
if (SDValue NewSRL = visitShiftByConstant(N))
return NewSRL;
// Attempt to convert a srl of a load into a narrower zero-extending load.
if (SDValue NarrowLoad = ReduceLoadWidth(N))
return NarrowLoad;
// Here is a common situation. We want to optimize:
//
// %a = ...
// %b = and i32 %a, 2
// %c = srl i32 %b, 1
// brcond i32 %c ...
//
// into
//
// %a = ...
// %b = and %a, 2
// %c = setcc eq %b, 0
// brcond %c ...
//
// However, after the source operand of the SRL is optimized into an AND, the
// SRL itself may not be optimized further. Look for it and add the BRCOND
// into the worklist.
if (N->hasOneUse()) {
SDNode *Use = *N->use_begin();
if (Use->getOpcode() == ISD::BRCOND)
AddToWorklist(Use);
else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
// Also look past the truncate.
Use = *Use->use_begin();
if (Use->getOpcode() == ISD::BRCOND)
AddToWorklist(Use);
}
}
// Try to transform this shift into a multiply-high if
// it matches the appropriate pattern detected in combineShiftToMULH.
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;
return SDValue();
}
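// Illustrative instance of the ctlz fold above (i32, hypothetical value): if
// X is known to be either 0 or 8 (only bit 3 can be set), then
//   srl (ctlz X), 5 --> xor (srl X, 3), 1
// X == 8 gives 0 and X == 0 gives 1 in both forms, but the SRL/XOR pair is
// usually easier to simplify further.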
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();
// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
if (isPowerOf2_32(BitWidth))
if (DAG.MaskedValueIsZero(
N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
return IsFSHL ? N0 : N1;
auto IsUndefOrZero = [](SDValue V) {
return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
};
// TODO - support non-uniform vector shift amounts.
if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
EVT ShAmtTy = N2.getValueType();
// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
}
unsigned ShAmt = Cst->getZExtValue();
if (ShAmt == 0)
return IsFSHL ? N0 : N1;
// fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
// fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
// fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
// fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
if (IsUndefOrZero(N0))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
SDLoc(N), ShAmtTy));
if (IsUndefOrZero(N1))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
SDLoc(N), ShAmtTy));
// fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// TODO - bigendian support once we have test coverage.
// TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
// TODO - permit LHS EXTLOAD if extensions are shifted out.
if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
!DAG.getDataLayout().isBigEndian()) {
auto *LHS = dyn_cast<LoadSDNode>(N0);
auto *RHS = dyn_cast<LoadSDNode>(N1);
if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
LHS->getAddressSpace() == RHS->getAddressSpace() &&
(LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
ISD::isNON_EXTLoad(LHS)) {
if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
SDLoc DL(RHS);
uint64_t PtrOff =
IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
bool Fast = false;
if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
RHS->getAddressSpace(), NewAlign,
RHS->getMemOperand()->getFlags(), &Fast) &&
Fast) {
SDValue NewPtr =
DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
AddToWorklist(NewPtr.getNode());
SDValue Load = DAG.getLoad(
VT, DL, RHS->getChain(), NewPtr,
RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
// Replace the old load's chain with the new load's chain.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
return Load;
}
}
}
}
}
// fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
// fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
// iff we know the shift amount is in range.
// TODO: when is it worth doing SUB(BW, N2) as well?
if (isPowerOf2_32(BitWidth)) {
APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
}
// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
// TODO: Investigate flipping this rotate if only one is legal. If funnel shift
// is legal as well, we might be better off avoiding a non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
// Simplify, based on bits shifted out of N0/N1.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
}
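// Illustrative instances (i8, hypothetical constants):
//   fshl X, X, 3  --> rotl X, 3   (when rotl is available)
//   fshl 0, N1, 3 --> srl N1, 5   (undef_or_zero high half)
//   fshr N0, 0, 3 --> shl N0, 5   (undef_or_zero low half)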
SDValue DAGCombiner::visitABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (abs c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
// fold (abs (abs x)) -> (abs x)
if (N0.getOpcode() == ISD::ABS)
return N0;
// fold (abs x) -> x iff not-negative
if (DAG.SignBitIsZero(N0))
return N0;
return SDValue();
}
SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (bswap c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0);
// fold (bswap (bswap x)) -> x
if (N0.getOpcode() == ISD::BSWAP)
return N0->getOperand(0);
return SDValue();
}
SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (bitreverse c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
// fold (bitreverse (bitreverse x)) -> x
if (N0.getOpcode() == ISD::BITREVERSE)
return N0.getOperand(0);
return SDValue();
}
SDValue DAGCombiner::visitCTLZ(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ctlz c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
// If the value is known never to be zero, switch to the undef version.
if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
if (DAG.isKnownNeverZero(N0))
return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
}
return SDValue();
}
SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ctlz_zero_undef c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
return SDValue();
}
SDValue DAGCombiner::visitCTTZ(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (cttz c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
// If the value is known never to be zero, switch to the undef version.
if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
if (DAG.isKnownNeverZero(N0))
return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
}
return SDValue();
}
SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (cttz_zero_undef c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
return SDValue();
}
SDValue DAGCombiner::visitCTPOP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ctpop c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
return SDValue();
}
// FIXME: This should be checking for no signed zeros on individual operands, as
// well as no nans.
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
SDValue RHS,
const TargetLowering &TLI) {
const TargetOptions &Options = DAG.getTarget().Options;
EVT VT = LHS.getValueType();
return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
TLI.isProfitableToCombineMinNumMaxNum(VT) &&
DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
}
/// Generate Min/Max node
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
ISD::CondCode CC, const TargetLowering &TLI,
SelectionDAG &DAG) {
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
switch (CC) {
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETLT:
case ISD::SETLE:
case ISD::SETULT:
case ISD::SETULE: {
// Since the operands are known never to be NaN to get here, either fminnum
// or fminnum_ieee is OK. Try the IEEE version first, since fminnum is
// expanded in terms of it.
unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
return SDValue();
}
case ISD::SETOGT:
case ISD::SETOGE:
case ISD::SETGT:
case ISD::SETGE:
case ISD::SETUGT:
case ISD::SETUGE: {
unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
return SDValue();
}
default:
return SDValue();
}
}
/// If a (v)select has a condition value that is a sign-bit test, try to smear
/// the condition operand sign-bit across the value width and use it as a mask.
static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue C1 = N->getOperand(1);
SDValue C2 = N->getOperand(2);
assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) &&
"Expected select-of-constants");
EVT VT = N->getValueType(0);
if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
VT != Cond.getOperand(0).getValueType())
return SDValue();
// The inverted-condition + commuted-select variants of these patterns are
// canonicalized to these forms in IR.
SDValue X = Cond.getOperand(0);
SDValue CondC = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
isAllOnesOrAllOnesSplat(C2)) {
// i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
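// (e.g. with i32 X = 7: 7 >>s 31 == 0, so the result is C1; with X = -5:
// -5 >>s 31 == -1, so the result is -1, matching the original select.)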
SDLoc DL(N);
SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
}
if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
// i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
SDLoc DL(N);
SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
}
return SDValue();
}
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
if (!VT.isInteger())
return SDValue();
auto *C1 = dyn_cast<ConstantSDNode>(N1);
auto *C2 = dyn_cast<ConstantSDNode>(N2);
if (!C1 || !C2)
return SDValue();
// Only do this before legalization to avoid conflicting with target-specific
// transforms in the other direction (create a select from a zext/sext). There
// is also a target-independent combine here in DAGCombiner in the other
// direction for (select Cond, -1, 0) when the condition is not i1.
if (CondVT == MVT::i1 && !LegalOperations) {
if (C1->isNullValue() && C2->isOne()) {
// select Cond, 0, 1 --> zext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
return NotCond;
}
if (C1->isNullValue() && C2->isAllOnesValue()) {
// select Cond, 0, -1 --> sext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
return NotCond;
}
if (C1->isOne() && C2->isNullValue()) {
// select Cond, 1, 0 --> zext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
return Cond;
}
if (C1->isAllOnesValue() && C2->isNullValue()) {
// select Cond, -1, 0 --> sext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return Cond;
}
// Use a target hook because some targets may prefer to transform in the
// other direction.
if (TLI.convertSelectOfConstantsToMath(VT)) {
// For any constants that differ by 1, we can transform the select into an
// extend and add.
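// e.g. select Cond, 5, 4 --> add (zext Cond), 4
//      select Cond, 4, 5 --> add (sext Cond), 5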
const APInt &C1Val = C1->getAPIntValue();
const APInt &C2Val = C2->getAPIntValue();
if (C1Val - 1 == C2Val) {
// select Cond, C1, C1-1 --> add (zext Cond), C1-1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
}
if (C1Val + 1 == C2Val) {
// select Cond, C1, C1+1 --> add (sext Cond), C1+1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
}
// select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
}
if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
return V;
}
return SDValue();
}
// fold (select Cond, 0, 1) -> (xor Cond, 1)
// We can't do this reliably if integer based booleans have different contents
// to floating point based booleans. This is because we can't tell whether we
// have an integer-based boolean or a floating-point-based boolean unless we
// can find the SETCC that produced it and inspect its operands. This is
// fairly easy if Cond is the SETCC node, but it can potentially be
// undiscoverable (or not reasonably discoverable). For example, it could be
// in another basic block or it could require searching a complicated
// expression.
if (CondVT.isInteger() &&
TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
TargetLowering::ZeroOrOneBooleanContent &&
TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
TargetLowering::ZeroOrOneBooleanContent &&
C1->isNullValue() && C2->isOne()) {
SDValue NotCond =
DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
if (VT.bitsEq(CondVT))
return NotCond;
return DAG.getZExtOrTrunc(NotCond, DL, VT);
}
return SDValue();
}
SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
EVT VT0 = N0.getValueType();
SDLoc DL(N);
SDNodeFlags Flags = N->getFlags();
if (SDValue V = DAG.simplifySelect(N0, N1, N2))
return V;
// fold (select X, X, Y) -> (or X, Y)
// fold (select X, 1, Y) -> (or C, Y)
if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
return DAG.getNode(ISD::OR, DL, VT, N0, N2);
if (SDValue V = foldSelectOfConstants(N))
return V;
// fold (select C, 0, X) -> (and (not C), X)
if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
AddToWorklist(NOTNode.getNode());
return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
}
// fold (select C, X, 1) -> (or (not C), X)
if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
AddToWorklist(NOTNode.getNode());
return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
}
// fold (select X, Y, X) -> (and X, Y)
// fold (select X, Y, 0) -> (and X, Y)
if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
return DAG.getNode(ISD::AND, DL, VT, N0, N1);
// If we can fold this based on the true/false value, do so.
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
if (VT0 == MVT::i1) {
// The code in this block deals with the following 2 equivalences:
// select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
// select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
// The target can specify its preferred form with the
// shouldNormalizeToSelectSequence() callback. However, we always transform
// to the right-hand (nested select) form if the inner select already exists
// in the DAG, and we always transform to the left-hand form if we know that
// we can further optimize the combination of the conditions.
bool normalizeToSequence =
TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
// select (and Cond0, Cond1), X, Y
// -> select Cond0, (select Cond1, X, Y), Y
if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
SDValue InnerSelect =
DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
if (normalizeToSequence || !InnerSelect.use_empty())
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
InnerSelect, N2, Flags);
// Cleanup on failure.
if (InnerSelect.use_empty())
recursivelyDeleteUnusedNodes(InnerSelect.getNode());
}
// select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
Cond1, N1, N2, Flags);
if (normalizeToSequence || !InnerSelect.use_empty())
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
InnerSelect, Flags);
// Cleanup on failure.
if (InnerSelect.use_empty())
recursivelyDeleteUnusedNodes(InnerSelect.getNode());
}
// select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
SDValue N1_0 = N1->getOperand(0);
SDValue N1_1 = N1->getOperand(1);
SDValue N1_2 = N1->getOperand(2);
if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
// Create the actual and node if we can generate good code for it.
if (!normalizeToSequence) {
SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
N2, Flags);
}
// Otherwise see if we can optimize the "and" to a better pattern.
if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
N2, Flags);
}
}
}
// select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
SDValue N2_0 = N2->getOperand(0);
SDValue N2_1 = N2->getOperand(1);
SDValue N2_2 = N2->getOperand(2);
if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
// Create the actual or node if we can generate good code for it.
if (!normalizeToSequence) {
SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
N2_2, Flags);
}
// Otherwise see if we can optimize to a better pattern.
if (SDValue Combined = visitORLike(N0, N2_0, N))
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
N2_2, Flags);
}
}
}
// select (not Cond), N1, N2 -> select Cond, N2, N1
if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
SelectOp->setFlags(Flags);
return SelectOp;
}
// Fold selects based on a setcc into other things, such as min/max/abs.
if (N0.getOpcode() == ISD::SETCC) {
SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
// select (fcmp lt x, y), x, y -> fminnum x, y
// select (fcmp gt x, y), x, y -> fmaxnum x, y
//
// This is OK if we don't care what happens if either operand is a NaN.
if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2,
CC, TLI, DAG))
return FMinMax;
// Use 'unsigned add with overflow' to optimize an unsigned saturating add.
// This is conservatively limited to pre-legal-operations to give targets
// a chance to reverse the transform if they want to do that. Also, it is
// unlikely that the pattern would be formed late, so it's probably not
// worth going through the other checks.
if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
// select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
// uaddo Cond0, C; select uaddo.1, -1, uaddo.0
//
// The IR equivalent of this transform would have this form:
// %a = add %x, C
// %c = icmp ugt %x, ~C
// %r = select %c, -1, %a
// =>
// %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
// %u0 = extractvalue %u, 0
// %u1 = extractvalue %u, 1
// %r = select %u1, -1, %u0
SDVTList VTs = DAG.getVTList(VT, VT0);
SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
}
}
if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
(!LegalOperations &&
TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
// Any flags available in a select/setcc fold will be on the setcc, as they
// migrated from the fcmp.
Flags = N0.getNode()->getFlags();
SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
N2, N0.getOperand(2));
SelectNode->setFlags(Flags);
return SelectNode;
}
return SimplifySelect(DL, N0, N1, N2);
}
return SDValue();
}
// This function assumes all the vselect's arguments are CONCAT_VECTORS
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
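// Illustrative: with Cond = <0, 0, -1, -1>, LHS = (concat_vectors A, B) and
// RHS = (concat_vectors C, D), the vselect becomes (concat_vectors C, B).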
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = N->getValueType(0);
int NumElems = VT.getVectorNumElements();
assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
RHS.getOpcode() == ISD::CONCAT_VECTORS &&
Cond.getOpcode() == ISD::BUILD_VECTOR);
// CONCAT_VECTORS can take an arbitrary number of arguments. We only care
// about binary ones here.
if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
return SDValue();
// We're sure we have an even number of elements due to the
// concat_vectors we have as arguments to vselect.
// Skip BV elements until we find one that's not an UNDEF. After we find a
// non-UNDEF element, keep looping until we get to half the length of the BV
// and check that all the non-undef nodes are the same.
ConstantSDNode *BottomHalf = nullptr;
for (int i = 0; i < NumElems / 2; ++i) {
if (Cond->getOperand(i)->isUndef())
continue;
if (BottomHalf == nullptr)
BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
else if (Cond->getOperand(i).getNode() != BottomHalf)
return SDValue();
}
// Do the same for the second half of the BuildVector
ConstantSDNode *TopHalf = nullptr;
for (int i = NumElems / 2; i < NumElems; ++i) {
if (Cond->getOperand(i)->isUndef())
continue;
if (TopHalf == nullptr)
TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
else if (Cond->getOperand(i).getNode() != TopHalf)
return SDValue();
}
assert(TopHalf && BottomHalf &&
"One half of the selector was all UNDEFs and the other was all the "
"same value. This should have been addressed before this function.");
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
}
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue Mask = MSC->getMask();
SDValue Chain = MSC->getChain();
SDLoc DL(N);
// Zap scatters with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
return SDValue();
}
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
SDValue Chain = MST->getChain();
SDLoc DL(N);
// Zap masked stores with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
// Try transforming N to an indexed store.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
return SDValue();
}
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
SDValue Mask = MGT->getMask();
SDLoc DL(N);
// Zap gathers with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
return SDValue();
}
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
SDValue Mask = MLD->getMask();
SDLoc DL(N);
// Zap masked loads with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MLD->getPassThru(), MLD->getChain());
// Try transforming N to an indexed load.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
return SDValue();
}
/// A vector select of 2 constant vectors can be simplified to math/logic to
/// avoid a variable select instruction and possibly avoid constant loads.
SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
!TLI.convertSelectOfConstantsToMath(VT) ||
!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
!ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
return SDValue();
// Check if we can use the condition value to increment/decrement a single
// constant value. This simplifies a select to an add and removes a constant
// load/materialization from the general case.
bool AllAddOne = true;
bool AllSubOne = true;
unsigned Elts = VT.getVectorNumElements();
for (unsigned i = 0; i != Elts; ++i) {
SDValue N1Elt = N1.getOperand(i);
SDValue N2Elt = N2.getOperand(i);
if (N1Elt.isUndef() || N2Elt.isUndef())
continue;
if (N1Elt.getValueType() != N2Elt.getValueType())
continue;
const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
if (C1 != C2 + 1)
AllAddOne = false;
if (C1 != C2 - 1)
AllSubOne = false;
}
// Further simplifications for the extra-special cases where the constants are
// all 0 or all -1 should be implemented as folds of these patterns.
SDLoc DL(N);
if (AllAddOne || AllSubOne) {
// vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
// vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
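// e.g. vselect Cond, <i32 3, i32 3>, <i32 2, i32 2>
//        --> add (zext Cond), <i32 2, i32 2>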
auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
}
// select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
APInt Pow2C;
if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
isNullOrNullSplat(N2)) {
SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
}
if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
return V;
// The general case for select-of-constants:
// vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
// ...but that only makes sense if a vselect is slower than 2 logic ops, so
// leave that to a machine-specific pass.
return SDValue();
}
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (SDValue V = DAG.simplifySelect(N0, N1, N2))
return V;
// vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
return DAG.getSelect(DL, VT, F, N2, N1);
// Canonicalize integer abs.
// vselect (setg[te] X, 0), X, -X ->
// vselect (setgt X, -1), X, -X ->
// vselect (setl[te] X, 0), -X, X ->
// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
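// (For negative X, Y is all-ones, so (X + Y) ^ Y == ~(X - 1) == -X; for
// non-negative X, Y is zero and the expression reduces to X.)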
if (N0.getOpcode() == ISD::SETCC) {
SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
bool isAbs = false;
bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
(ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
if (isAbs) {
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
return DAG.getNode(ISD::ABS, DL, VT, LHS);
SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
DAG.getConstant(VT.getScalarSizeInBits() - 1,
DL, getShiftAmountTy(VT)));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
AddToWorklist(Shift.getNode());
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
}
// vselect x, y (fcmp lt x, y) -> fminnum x, y
// vselect x, y (fcmp gt x, y) -> fmaxnum x, y
//
// This is OK if we don't care about what happens if either operand is a
// NaN.
//
if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
if (SDValue FMinMax =
combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
return FMinMax;
}
// If this select has a condition (setcc) with narrower operands than the
// select, try to widen the compare to match the select width.
// TODO: This should be extended to handle any constant.
// TODO: This could be extended to handle non-loading patterns, but that
// requires thorough testing to avoid regressions.
if (isNullOrNullSplat(RHS)) {
EVT NarrowVT = LHS.getValueType();
EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
EVT SetCCVT = getSetCCResultType(LHS.getValueType());
unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
unsigned WideWidth = WideVT.getScalarSizeInBits();
bool IsSigned = isSignedIntSetCC(CC);
auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
SetCCWidth != 1 && SetCCWidth < WideWidth &&
TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
// Both compare operands can be widened for free. The LHS can use an
// extended load, and the RHS is a constant:
// vselect (ext (setcc load(X), C)), N1, N2 -->
// vselect (setcc extload(X), C'), N1, N2
auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
EVT WideSetCCVT = getSetCCResultType(WideVT);
SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
}
}
}
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
// Fold (vselect (build_vector all_ones), N1, N2) -> N1
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
// Fold (vselect (build_vector all_zeros), N1, N2) -> N2
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N2;
// ConvertSelectToConcatVector assumes that both of the above checks for
// (vselect (build_vector all{ones,zeros}) ...) have already been made and
// addressed.
if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
N2.getOpcode() == ISD::CONCAT_VECTORS &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
return CV;
}
if (SDValue V = foldVSelectOfConstants(N))
return V;
return SDValue();
}
SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
SDValue N3 = N->getOperand(3);
SDValue N4 = N->getOperand(4);
ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
// fold select_cc lhs, rhs, x, x, cc -> x
if (N2 == N3)
return N2;
// Determine if the condition we're dealing with is constant
if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
CC, SDLoc(N), false)) {
AddToWorklist(SCC.getNode());
if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
if (!SCCC->isNullValue())
return N2; // cond always true -> true val
else
return N3; // cond always false -> false val
} else if (SCC->isUndef()) {
// When the condition is UNDEF, just return the first operand. This is
// consistent with DAG creation: no setcc node is created in this case.
return N2;
} else if (SCC.getOpcode() == ISD::SETCC) {
// Fold to a simpler select_cc
SDValue SelectOp = DAG.getNode(
ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
SCC.getOperand(1), N2, N3, SCC.getOperand(2));
SelectOp->setFlags(SCC->getFlags());
return SelectOp;
}
}
// If we can fold this based on the true/false value, do so.
if (SimplifySelectOps(N, N2, N3))
return SDValue(N, 0); // Don't revisit N.
// fold select_cc into other things, such as min/max/abs
return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
}
SDValue DAGCombiner::visitSETCC(SDNode *N) {
// setcc is very commonly used as an argument to brcond. This pattern
// also lends itself to numerous combines and, as a result, it is desirable
// to keep the argument to a brcond as a setcc as much as possible.
bool PreferSetCC =
N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
SDValue Combined = SimplifySetCC(
N->getValueType(0), N->getOperand(0), N->getOperand(1),
cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC);
if (!Combined)
return SDValue();
// If we prefer to have a setcc, and we don't, we'll try our best to
// recreate one using rebuildSetCC.
if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
SDValue NewSetCC = rebuildSetCC(Combined);
// We don't have anything interesting to combine to.
if (NewSetCC.getNode() == N)
return SDValue();
if (NewSetCC)
return NewSetCC;
}
return Combined;
}
SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue Carry = N->getOperand(2);
SDValue Cond = N->getOperand(3);
// If Carry is false, fold to a regular SETCC.
if (isNullConstant(Carry))
return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
return SDValue();
}
/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
/// a build_vector of constants.
/// This function is called by the DAGCombiner when visiting sext/zext/aext
/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
/// Vector extends are not folded if operations are legal; this is to
/// avoid introducing illegal build_vector dag nodes.
static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
SelectionDAG &DAG, bool LegalTypes) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc DL(N);
assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
&& "Expected EXTEND dag node in input!");
// fold (sext c1) -> c1
// fold (zext c1) -> c1
// fold (aext c1) -> c1
if (isa<ConstantSDNode>(N0))
return DAG.getNode(Opcode, DL, VT, N0);
// fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
// fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
// fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
if (N0->getOpcode() == ISD::SELECT) {
SDValue Op1 = N0->getOperand(1);
SDValue Op2 = N0->getOperand(2);
if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
(Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
// For any_extend, choose sign extension of the constants to allow a
// possible further transform to sign_extend_inreg, i.e.
//
// t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
// t2: i64 = any_extend t1
// -->
// t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
// -->
// t4: i64 = sign_extend_inreg t3
unsigned FoldOpc = Opcode;
if (FoldOpc == ISD::ANY_EXTEND)
FoldOpc = ISD::SIGN_EXTEND;
return DAG.getSelect(DL, VT, N0->getOperand(0),
DAG.getNode(FoldOpc, DL, VT, Op1),
DAG.getNode(FoldOpc, DL, VT, Op2));
}
}
// fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
// fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
// fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
EVT SVT = VT.getScalarType();
if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
return SDValue();
// We can fold this node into a build_vector.
unsigned VTBits = SVT.getSizeInBits();
unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
SmallVector<SDValue, 8> Elts;
unsigned NumElts = VT.getVectorNumElements();
// For zero-extensions, UNDEF elements are still guaranteed to have their
// upper bits set to zero.
bool IsZext =
Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Op = N0.getOperand(i);
if (Op.isUndef()) {
Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT));
continue;
}
SDLoc DL(Op);
// Get the constant value and if needed trunc it to the size of the type.
// Nodes like build_vector might have constants wider than the scalar type.
APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
else
Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
}
return DAG.getBuildVector(VT, DL, Elts);
}
// ExtendUsesToFormExtLoad - Try to extend the uses of a load to enable the
// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
// transformation. Returns true if the extensions are possible and the
// above-mentioned transformation is profitable.
static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
unsigned ExtOpc,
SmallVectorImpl<SDNode *> &ExtendNodes,
const TargetLowering &TLI) {
bool HasCopyToRegUses = false;
bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
UE = N0.getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User == N)
continue;
if (UI.getUse().getResNo() != N0.getResNo())
continue;
// FIXME: Only extend SETCC N, N and SETCC N, c for now.
if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
// Sign bits will be lost after a zext.
return false;
bool Add = false;
for (unsigned i = 0; i != 2; ++i) {
SDValue UseOp = User->getOperand(i);
if (UseOp == N0)
continue;
if (!isa<ConstantSDNode>(UseOp))
return false;
Add = true;
}
if (Add)
ExtendNodes.push_back(User);
continue;
}
// If truncates aren't free and there are users we can't
// extend, it isn't worthwhile.
if (!isTruncFree)
return false;
// Remember if this value is live-out.
if (User->getOpcode() == ISD::CopyToReg)
HasCopyToRegUses = true;
}
if (HasCopyToRegUses) {
bool BothLiveOut = false;
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDUse &Use = UI.getUse();
if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
BothLiveOut = true;
break;
}
}
if (BothLiveOut)
// Both unextended and extended values are live out. There had better be
// a good reason for the transformation.
return ExtendNodes.size();
}
return true;
}
void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
SDValue OrigLoad, SDValue ExtLoad,
ISD::NodeType ExtType) {
// Extend SetCC uses if necessary.
SDLoc DL(ExtLoad);
for (SDNode *SetCC : SetCCs) {
SmallVector<SDValue, 4> Ops;
for (unsigned j = 0; j != 2; ++j) {
SDValue SOp = SetCC->getOperand(j);
if (SOp == OrigLoad)
Ops.push_back(ExtLoad);
else
Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
}
Ops.push_back(SetCC->getOperand(2));
CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
}
}
// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT DstVT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND) &&
"Unexpected node type (not an extend)!");
// fold (sext (load x)) to multiple smaller sextloads; same for zext.
// For example, on a target with legal v4i32, but illegal v8i32, turn:
// (v8i32 (sext (v8i16 (load x))))
// into:
// (v8i32 (concat_vectors (v4i32 (sextload x)),
// (v4i32 (sextload (x + 16)))))
// Where uses of the original load, i.e.:
// (v8i16 (load x))
// are replaced with:
// (v8i16 (truncate
// (v8i32 (concat_vectors (v4i32 (sextload x)),
// (v4i32 (sextload (x + 16)))))))
//
// This combine is only applicable to illegal, but splittable, vectors.
// All legal types, and illegal non-vector types, are handled elsewhere.
// This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
//
if (N0->getOpcode() != ISD::LOAD)
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
!N0.hasOneUse() || !LN0->isSimple() ||
!DstVT.isVector() || !DstVT.isPow2VectorType() ||
!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
return SDValue();
SmallVector<SDNode *, 4> SetCCs;
if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
return SDValue();
ISD::LoadExtType ExtType =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
// Try to split the vector types to get down to legal types.
EVT SplitSrcVT = SrcVT;
EVT SplitDstVT = DstVT;
while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
SplitSrcVT.getVectorNumElements() > 1) {
SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
}
if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
return SDValue();
assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");
SDLoc DL(N);
const unsigned NumSplits =
DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
const unsigned Stride = SplitSrcVT.getStoreSize();
SmallVector<SDValue, 4> Loads;
SmallVector<SDValue, 4> Chains;
SDValue BasePtr = LN0->getBasePtr();
for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
const unsigned Offset = Idx * Stride;
const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
SDValue SplitLoad = DAG.getExtLoad(
ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL);
Loads.push_back(SplitLoad.getValue(0));
Chains.push_back(SplitLoad.getValue(1));
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
// Simplify TF.
AddToWorklist(NewChain.getNode());
CombineTo(N, NewValue);
// Replace uses of the original load (before extension)
// with a truncate of the concatenated sextloaded vectors.
SDValue Trunc =
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
CombineTo(N0.getNode(), Trunc, NewChain);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
assert(N->getOpcode() == ISD::ZERO_EXTEND);
EVT VT = N->getValueType(0);
EVT OrigVT = N->getOperand(0).getValueType();
if (TLI.isZExtFree(OrigVT, VT))
return SDValue();
// and/or/xor
SDValue N0 = N->getOperand(0);
if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) ||
N0.getOperand(1).getOpcode() != ISD::Constant ||
(LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
return SDValue();
// shl/shr
SDValue N1 = N0->getOperand(0);
if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
N1.getOperand(1).getOpcode() != ISD::Constant ||
(LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
return SDValue();
// load
if (!isa<LoadSDNode>(N1.getOperand(0)))
return SDValue();
LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
EVT MemVT = Load->getMemoryVT();
if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
return SDValue();
// If the shift op is SHL, the logic op must be AND, otherwise the result
// will be wrong.
if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SmallVector<SDNode*, 4> SetCCs;
if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
ISD::ZERO_EXTEND, SetCCs, TLI))
return SDValue();
// Actually do the transformation.
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
Load->getChain(), Load->getBasePtr(),
Load->getMemoryVT(), Load->getMemOperand());
SDLoc DL1(N1);
SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
N1.getOperand(1));
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL0(N0);
SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
DAG.getConstant(Mask, DL0, VT));
ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
CombineTo(N, And);
if (SDValue(Load, 0).hasOneUse()) {
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
Load->getValueType(0), ExtLoad);
CombineTo(Load, Trunc, ExtLoad.getValue(1));
}
// N0 is dead at this point.
recursivelyDeleteUnusedNodes(N0.getNode());
return SDValue(N,0); // Return N so it doesn't get rechecked!
}
/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
/// sizes for a select condition and other operands should be more efficient.
SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
unsigned CastOpcode = Cast->getOpcode();
assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
CastOpcode == ISD::FP_ROUND) &&
"Unexpected opcode for vector select narrowing/widening");
// We only do this transform before legal ops because the pattern may be
// obfuscated by target-specific operations after legalization. Do not create
// an illegal select op, however, because that may be difficult to lower.
EVT VT = Cast->getValueType(0);
if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
SDValue VSel = Cast->getOperand(0);
if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
VSel.getOperand(0).getOpcode() != ISD::SETCC)
return SDValue();
// Does the setcc have the same vector size as the casted select?
SDValue SetCC = VSel.getOperand(0);
EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
SDValue A = VSel.getOperand(1);
SDValue B = VSel.getOperand(2);
SDValue CastA, CastB;
SDLoc DL(Cast);
if (CastOpcode == ISD::FP_ROUND) {
// FP_ROUND (fptrunc) has an extra flag operand to pass along.
CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
} else {
CastA = DAG.getNode(CastOpcode, DL, VT, A);
CastB = DAG.getNode(CastOpcode, DL, VT, B);
}
return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
}
// fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
const TargetLowering &TLI, EVT VT,
bool LegalOperations, SDNode *N,
SDValue N0, ISD::LoadExtType ExtLoadType) {
SDNode *N0Node = N0.getNode();
bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
: ISD::isZEXTLoad(N0Node);
if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
!ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT MemVT = LN0->getMemoryVT();
if ((LegalOperations || !LN0->isSimple() ||
VT.isVector()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
return SDValue();
SDValue ExtLoad =
DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
LN0->getBasePtr(), MemVT, LN0->getMemOperand());
Combiner.CombineTo(N, ExtLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
if (LN0->use_empty())
Combiner.recursivelyDeleteUnusedNodes(LN0);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// Only generate vector extloads when 1) they're legal, and 2) they are
// deemed desirable by the target.
static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
const TargetLowering &TLI, EVT VT,
bool LegalOperations, SDNode *N, SDValue N0,
ISD::LoadExtType ExtLoadType,
ISD::NodeType ExtOpc) {
if (!ISD::isNON_EXTLoad(N0.getNode()) ||
!ISD::isUNINDEXEDLoad(N0.getNode()) ||
((LegalOperations || VT.isVector() ||
!cast<LoadSDNode>(N0)->isSimple()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
return {};
bool DoXform = true;
SmallVector<SDNode *, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
if (VT.isVector())
DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (!DoXform)
return {};
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
Combiner.CombineTo(N, ExtLoad);
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
Combiner.recursivelyDeleteUnusedNodes(LN0);
} else {
SDValue Trunc =
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
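// fold ([s|z]ext (masked_load x)) -> ([s|z]ext masked_load x)
// Only done when the extending masked load is legal and deemed desirable by
// the target; the pass-through operand is extended to the wider type as well.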
static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
const TargetLowering &TLI, EVT VT,
SDNode *N, SDValue N0,
ISD::LoadExtType ExtLoadType,
ISD::NodeType ExtOpc) {
if (!N0.hasOneUse())
return SDValue();
MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
return SDValue();
if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
return SDValue();
SDLoc dl(Ld);
SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
ExtLoadType, Ld->isExpandingLoad());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
return NewLoad;
}
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
bool LegalOperations) {
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
SDValue SetCC = N->getOperand(0);
if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
!SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
return SDValue();
SDValue X = SetCC.getOperand(0);
SDValue Ones = SetCC.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
EVT VT = N->getValueType(0);
EVT XVT = X.getValueType();
// setge X, C is canonicalized to setgt, so we do not need to match that
// pattern. The setlt sibling is folded in SimplifySelectCC() because it does
// not require the 'not' op.
if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
// Invert and smear/shift the sign bit:
// sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
// zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
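// e.g. for i32: sext i1 (setgt X, -1) --> sra (xor X, -1), 31, which is
// all-ones exactly when X is non-negative and zero otherwise.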
SDLoc DL(N);
unsigned ShCt = VT.getSizeInBits() - 1;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
SDValue NotX = DAG.getNOT(DL, X, VT);
SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
auto ShiftOpcode =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
}
}
return SDValue();
}
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (sext (sext x)) -> (sext x)
// fold (sext (aext x)) -> (sext x)
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (sext (truncate (load x))) -> (sext (smaller load x))
// fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
AddToWorklist(oye);
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// See if the value being truncated is already sign extended. If so, just
// eliminate the trunc/sext pair.
SDValue Op = N0.getOperand(0);
unsigned OpBits = Op.getScalarValueSizeInBits();
unsigned MidBits = N0.getScalarValueSizeInBits();
unsigned DestBits = VT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
if (OpBits == DestBits) {
// Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
// bits, it is already sign-extended and can be returned directly.
if (NumSignBits > DestBits-MidBits)
return Op;
} else if (OpBits < DestBits) {
// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
// bits, just sext from i32.
if (NumSignBits > OpBits-MidBits)
return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
} else {
// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
// bits, just truncate to i32.
if (NumSignBits > OpBits-MidBits)
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
// fold (sext (truncate x)) -> (sextinreg x).
if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
N0.getValueType())) {
if (OpBits < DestBits)
Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
else if (OpBits > DestBits)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
DAG.getValueType(N0.getValueType()));
}
}
// Try to simplify (sext (load x)).
if (SDValue foldedExt =
tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
ISD::SEXTLOAD, ISD::SIGN_EXTEND))
return foldedExt;
if (SDValue foldedExt =
tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
ISD::SIGN_EXTEND))
return foldedExt;
// fold (sext (load x)) to multiple smaller sextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
return ExtLoad;
// Try to simplify (sext (sextload x)).
if (SDValue foldedExt = tryToFoldExtOfExtload(
DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
return foldedExt;
// fold (sext (and/or/xor (load x), cst)) ->
// (and/or/xor (sextload x), (sext cst))
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
EVT MemVT = LN00->getMemoryVT();
if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
SmallVector<SDNode*, 4> SetCCs;
bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
ISD::SIGN_EXTEND, SetCCs, TLI);
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
LN00->getChain(), LN00->getBasePtr(),
LN00->getMemoryVT(),
LN00->getMemOperand());
APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
SDValue TruncAnd =
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
}
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
LN00->getValueType(0), ExtLoad);
CombineTo(LN00, Trunc, ExtLoad.getValue(1));
}
return SDValue(N,0); // Return N so it doesn't get rechecked!
}
}
}
if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
return V;
if (N0.getOpcode() == ISD::SETCC) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
EVT N00VT = N0.getOperand(0).getValueType();
// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations &&
TLI.getBooleanContents(N00VT) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// of the same size as the compared operands. Only optimize sext(setcc())
// if this is the case.
EVT SVT = getSetCCResultType(N00VT);
// If we already have the desired type, don't change it.
if (SVT != N0.getValueType()) {
// We know that the # elements of the results is the same as the
// # elements of the compare (and the # elements of the compare result
// for that matter). Check to see that they are the same size. If so,
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == SVT.getSizeInBits())
return DAG.getSetCC(DL, VT, N00, N01, CC);
// If the desired elements are smaller or larger than the source
// elements, we can use a matching integer vector type and then
// truncate/sign extend.
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (SVT == MatchingVecType) {
SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
return DAG.getSExtOrTrunc(VsetCC, DL, VT);
}
}
}
// sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
// Here, T can be 1 or -1, depending on the type of the setcc and
// getBooleanContents().
unsigned SetCCWidth = N0.getScalarValueSizeInBits();
// To determine the "true" side of the select, we need to know the high bit
// of the value returned by the setcc if it evaluates to true.
// If the type of the setcc is i1, then the true case of the select is just
// sext(i1 1), that is, -1.
// If the type of the setcc is larger (say, i8) then the value of the high
// bit depends on getBooleanContents(), so ask TLI for a real "true" value
// of the appropriate width.
SDValue ExtTrueVal = (SetCCWidth == 1)
? DAG.getAllOnesConstant(DL, VT)
: DAG.getBoolConstant(true, DL, VT, N00VT);
SDValue Zero = DAG.getConstant(0, DL, VT);
if (SDValue SCC =
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;
if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
EVT SetCCVT = getSetCCResultType(N00VT);
// Don't do this transform for i1 because there's a select transform
// that would reverse it.
// TODO: We should not do this transform at all without a target hook
// because a sext is likely cheaper than a select?
if (SetCCVT.getScalarSizeInBits() != 1 &&
(!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
}
}
}
// fold (sext x) -> (zext x) if the sign bit is known zero.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
// Eliminate this sign extend by doing a negation in the destination type:
// sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
isNullOrNullSplat(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
}
// Eliminate this sign extend by doing a decrement in the destination type:
// sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
}
return SDValue();
}
// isTruncateOf - If N is a truncate of some other value, return true, record
// the value being truncated in Op and which of Op's bits are zero/one in Known.
// This function computes KnownBits to avoid a duplicated call to
// computeKnownBits in the caller.
static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
KnownBits &Known) {
if (N->getOpcode() == ISD::TRUNCATE) {
Op = N->getOperand(0);
Known = DAG.computeKnownBits(Op);
return true;
}
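// Also treat (setne X, 0) with an i1 result as a truncate to i1 when the
// known bits of X show that only bit 0 can be nonzero (checked below).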
if (N.getOpcode() != ISD::SETCC ||
N.getValueType().getScalarType() != MVT::i1 ||
cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
return false;
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
assert(Op0.getValueType() == Op1.getValueType());
if (isNullOrNullSplat(Op0))
Op = Op1;
else if (isNullOrNullSplat(Op1))
Op = Op0;
else
return false;
Known = DAG.computeKnownBits(Op);
return (Known.Zero | 1).isAllOnesValue();
}
/// Given an extending node with a pop-count operand, if the target does not
/// support a pop-count in the narrow source type but does support it in the
/// destination type, widen the pop-count to the destination type.
static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
SDValue CtPop = Extend->getOperand(0);
if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
return SDValue();
EVT VT = Extend->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
return SDValue();
// zext (ctpop X) --> ctpop (zext X)
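// This is safe because zero extension adds only zero bits, so the
// population count of the wider value is unchanged.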
SDLoc DL(Extend);
SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
}
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (zext (zext x)) -> (zext x)
// fold (zext (aext x)) -> (zext x)
if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
N0.getOperand(0));
// fold (zext (truncate x)) -> (zext x) or
// (zext (truncate x)) -> (truncate x)
// This is valid when the truncated bits of x are already zero.
SDValue Op;
KnownBits Known;
if (isTruncateOf(DAG, N0, Op, Known)) {
APInt TruncatedBits =
(Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
APInt(Op.getScalarValueSizeInBits(), 0) :
APInt::getBitsSet(Op.getScalarValueSizeInBits(),
N0.getScalarValueSizeInBits(),
std::min(Op.getScalarValueSizeInBits(),
VT.getScalarSizeInBits()));
if (TruncatedBits.isSubsetOf(Known.Zero))
return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
// fold (zext (truncate x)) -> (and x, mask)
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (zext (truncate (load x))) -> (zext (smaller load x))
// fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
AddToWorklist(oye);
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
EVT SrcVT = N0.getOperand(0).getValueType();
EVT MinVT = N0.getValueType();
// Try to mask before the extension to avoid having to generate a larger mask,
// possibly over several sub-vectors.
if (SrcVT.bitsLT(VT) && VT.isVector()) {
if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
SDValue Op = N0.getOperand(0);
Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
AddToWorklist(Op.getNode());
SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
// Transfer the debug info; the new node is equivalent to N0.
DAG.transferDbgValues(N0, ZExtOrTrunc);
return ZExtOrTrunc;
}
}
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
AddToWorklist(Op.getNode());
SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
// We may safely transfer the debug info describing the truncate node over
// to the equivalent and operation.
DAG.transferDbgValues(N0, And);
return And;
}
}
// Fold (zext (and (trunc x), cst)) -> (and x, cst),
// if either of the casts is not free.
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
(!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
}
// Try to simplify (zext (load x)).
if (SDValue foldedExt =
tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
return foldedExt;
if (SDValue foldedExt =
tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
ISD::ZERO_EXTEND))
return foldedExt;
// fold (zext (load x)) to multiple smaller zextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
return ExtLoad;
// fold (zext (and/or/xor (load x), cst)) ->
// (and/or/xor (zextload x), (zext cst))
// Unless (and (load x) cst) will match as a zextload already and has
// additional users.
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
EVT MemVT = LN00->getMemoryVT();
if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse()) {
if (N0.getOpcode() == ISD::AND) {
auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
EVT LoadResultTy = AndC->getValueType(0);
EVT ExtVT;
if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
DoXform = false;
}
}
if (DoXform)
DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
ISD::ZERO_EXTEND, SetCCs, TLI);
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
LN00->getChain(), LN00->getBasePtr(),
LN00->getMemoryVT(),
LN00->getMemOperand());
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
SDValue TruncAnd =
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
}
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
LN00->getValueType(0), ExtLoad);
CombineTo(LN00, Trunc, ExtLoad.getValue(1));
}
return SDValue(N,0); // Return N so it doesn't get rechecked!
}
}
}
// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
return ZExtLoad;
// Try to simplify (zext (zextload x)).
if (SDValue foldedExt = tryToFoldExtOfExtload(
DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
return foldedExt;
if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
return V;
if (N0.getOpcode() == ISD::SETCC) {
// Only do this before legalize for now.
if (!LegalOperations && VT.isVector() &&
N0.getValueType().getVectorElementType() == MVT::i1) {
EVT N00VT = N0.getOperand(0).getValueType();
if (getSetCCResultType(N00VT) == N0.getValueType())
return SDValue();
// We know that the # elements of the result is the same as the #
// elements of the compare (and the # elements of the compare result for
// that matter). Check to see that they are the same size. If so, we know
// that the element size of the extended result matches the element size of
// the compare operands.
SDLoc DL(N);
if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
// zext(setcc) -> zext_in_reg(vsetcc) for vectors.
SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
}
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/any extend followed by zext_in_reg.
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
N0.getValueType());
}
// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
if (SDValue SCC = SimplifySelectCC(
DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT),
cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
return SCC;
}
// (zext (shl (zext x), cst)) -> (shl (zext x), cst)
if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
N0.hasOneUse()) {
SDValue ShAmt = N0.getOperand(1);
if (N0.getOpcode() == ISD::SHL) {
SDValue InnerZExt = N0.getOperand(0);
// If the original shl may be shifting out bits, do not perform this
// transformation.
unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
InnerZExt.getOperand(0).getValueSizeInBits();
if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
return SDValue();
}
SDLoc DL(N);
// Ensure that the shift amount is wide enough for the shifted value.
if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
return DAG.getNode(N0.getOpcode(), DL, VT,
DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
ShAmt);
}
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
if (SDValue NewCtPop = widenCtPop(N, DAG))
return NewCtPop;
return SDValue();
}
SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (aext (aext x)) -> (aext x)
// fold (aext (zext x)) -> (zext x)
// fold (aext (sext x)) -> (sext x)
if (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
// fold (aext (truncate (load x))) -> (aext (smaller load x))
// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
if (N0.getOpcode() == ISD::TRUNCATE) {
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
AddToWorklist(oye);
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// fold (aext (truncate x))
if (N0.getOpcode() == ISD::TRUNCATE)
return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, DL, VT);
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
}
// fold (aext (load x)) -> (aext (truncate (extload x)))
// None of the supported targets knows how to perform load and any_ext
// on vectors in one instruction. We only perform this transformation on
// scalars.
if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
TLI);
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = N0.hasOneUse();
CombineTo(N, ExtLoad);
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
recursivelyDeleteUnusedNodes(LN0);
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
CombineTo(LN0, Trunc, ExtLoad.getValue(1));
}
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// fold (aext (zextload x)) -> (aext (truncate (zextload x)))
// fold (aext (sextload x)) -> (aext (truncate (sextload x)))
// fold (aext ( extload x)) -> (aext (truncate (extload x)))
if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
ISD::LoadExtType ExtType = LN0->getExtensionType();
EVT MemVT = LN0->getMemoryVT();
if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
VT, LN0->getChain(), LN0->getBasePtr(),
MemVT, LN0->getMemOperand());
CombineTo(N, ExtLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
recursivelyDeleteUnusedNodes(LN0);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
if (N0.getOpcode() == ISD::SETCC) {
// For vectors:
// aext(setcc) -> vsetcc
// aext(setcc) -> truncate(vsetcc)
// aext(setcc) -> aext(vsetcc)
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations) {
EVT N00VT = N0.getOperand(0).getValueType();
if (getSetCCResultType(N00VT) == N0.getValueType())
return SDValue();
// We know that the # elements of the result is the same as the
// # elements of the compare (and the # elements of the compare result
// for that matter). Check to see that they are the same size. If so,
// we know that the element size of the extended result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == N00VT.getSizeInBits())
return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/any extend
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
}
// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
if (SDValue SCC = SimplifySelectCC(
DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT),
cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
return SCC;
}
if (SDValue NewCtPop = widenCtPop(N, DAG))
return NewCtPop;
return SDValue();
}
SDValue DAGCombiner::visitAssertExt(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT AssertVT = cast<VTSDNode>(N1)->getVT();
// fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
if (N0.getOpcode() == Opcode &&
AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
return N0;
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == Opcode) {
// We have an assert, truncate, assert sandwich. Make one stronger assert
// by applying the smaller of the two asserted types directly to the larger
// source value. This eliminates the later assert:
// assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
// assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
"Asserting zero/sign-extended bits to a type larger than the "
"truncated destination does not provide information");
SDLoc DL(N);
EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
BigA.getOperand(0), MinAssertVTVal);
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
}
// If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
// than X, just move the AssertZext in front of the truncate and drop the
// AssertSext.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::AssertSext &&
Opcode == ISD::AssertZext) {
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
"Asserting zero/sign-extended bits to a type larger than the "
"truncated destination does not provide information");
if (AssertVT.bitsLT(BigA_AssertVT)) {
SDLoc DL(N);
SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
BigA.getOperand(0), N1);
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
}
}
return SDValue();
}
SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
SDLoc DL(N);
Align AL = cast<AssertAlignSDNode>(N)->getAlign();
SDValue N0 = N->getOperand(0);
// Fold (assertalign (assertalign x, AL0), AL1) ->
// (assertalign x, max(AL0, AL1))
if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
return DAG.getAssertAlign(DL, N0.getOperand(0),
std::max(AL, AAN->getAlign()));
// In rare cases, there are trivial arithmetic ops in the source operands. Sink
// this assert down to the source operands so that those arithmetic ops can be
// exposed to DAG combining.
switch (N0.getOpcode()) {
default:
break;
case ISD::ADD:
case ISD::SUB: {
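// If at least one operand is already known to satisfy the alignment, attach
// the assert to the operand(s) that are not and rebuild the add/sub so its
// operands become visible to further combines.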
unsigned AlignShift = Log2(AL);
SDValue LHS = N0.getOperand(0);
SDValue RHS = N0.getOperand(1);
unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
if (LHSAlignShift < AlignShift)
LHS = DAG.getAssertAlign(DL, LHS, AL);
if (RHSAlignShift < AlignShift)
RHS = DAG.getAssertAlign(DL, RHS, AL);
return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
}
break;
}
}
return SDValue();
}
/// If the result of a wider load is shifted right by N bits and then truncated
/// to a narrower type, where N is a multiple of the number of bits of the
/// narrower type, transform it to a narrower load from address + N / (number of
/// bits of the new type). Also narrow the load if the result is masked with an
/// AND that effectively produces a smaller type. If the result is to be
/// extended, also fold the extension to form an extending load.
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
unsigned Opc = N->getOpcode();
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT ExtVT = VT;
// This transformation isn't valid for vector loads.
if (VT.isVector())
return SDValue();
unsigned ShAmt = 0;
bool HasShiftedOffset = false;
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
// extending to VT.
if (Opc == ISD::SIGN_EXTEND_INREG) {
ExtType = ISD::SEXTLOAD;
ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
} else if (Opc == ISD::SRL) {
// Another special-case: SRL is basically zero-extending a narrower value,
// or it may be shifting a higher subword, half or byte into the lowest
// bits.
ExtType = ISD::ZEXTLOAD;
N0 = SDValue(N, 0);
auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01 || !LN0)
return SDValue();
uint64_t ShiftAmt = N01->getZExtValue();
uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
else
ExtVT = EVT::getIntegerVT(*DAG.getContext(),
VT.getSizeInBits() - ShiftAmt);
} else if (Opc == ISD::AND) {
// An AND with a constant mask is the same as a truncate + zero-extend.
auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!AndC)
return SDValue();
const APInt &Mask = AndC->getAPIntValue();
unsigned ActiveBits = 0;
if (Mask.isMask()) {
ActiveBits = Mask.countTrailingOnes();
} else if (Mask.isShiftedMask()) {
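// e.g. a mask of 0x00ff00 narrows the load to the 8 bits starting at bit 8;
// the result is shifted back into position below (see HasShiftedOffset).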
ShAmt = Mask.countTrailingZeros();
APInt ShiftedMask = Mask.lshr(ShAmt);
ActiveBits = ShiftedMask.countTrailingOnes();
HasShiftedOffset = true;
} else
return SDValue();
ExtType = ISD::ZEXTLOAD;
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
}
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
SDValue SRL = N0;
if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
ShAmt = ConstShift->getZExtValue();
unsigned EVTBits = ExtVT.getSizeInBits();
// Is the shift amount a multiple of the size of ExtVT?
if ((ShAmt & (EVTBits-1)) == 0) {
N0 = N0.getOperand(0);
// Is the load width a multiple of the size of ExtVT?
if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
return SDValue();
}
// At this point, we must have a load or else we can't do the transform.
auto *LN0 = dyn_cast<LoadSDNode>(N0);
if (!LN0) return SDValue();
// Because a SRL must be assumed to *need* to zero-extend the high bits
// (as opposed to anyext the high bits), we can't combine the zextload
// lowering of SRL and an sextload.
if (LN0->getExtensionType() == ISD::SEXTLOAD)
return SDValue();
// If the shift amount is larger than the input type then we're not
// accessing any of the loaded bytes. If the load was a zextload/extload
// then the result of the shift+trunc is zero/undef (handled elsewhere).
if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
return SDValue();
// If the SRL is only used by a masking AND, we may be able to adjust
// the ExtVT to make the AND redundant.
SDNode *Mask = *(SRL->use_begin());
if (Mask->getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Mask->getOperand(1))) {
const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
if (ShiftMask.isMask()) {
EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
ShiftMask.countTrailingOnes());
// If the mask is smaller, recompute the type.
if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
ExtVT = MaskedVT;
}
}
}
}
// If the load is shifted left (and the result isn't shifted back right),
// we can fold the truncate through the shift.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShLeftAmt = N01->getZExtValue();
N0 = N0.getOperand(0);
}
}
// If we haven't found a load, we can't narrow it.
if (!isa<LoadSDNode>(N0))
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Reducing the width of a volatile load is illegal. For atomics, we may be
// able to reduce the width provided we never widen again. (see D66309)
if (!LN0->isSimple() ||
!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
auto AdjustBigEndianShift = [&](unsigned ShAmt) {
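// On big-endian targets the loaded bytes are numbered from the most
// significant end, so mirror the bit offset within the stored value.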
unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
return LVTStoreBits - EVTStoreBits - ShAmt;
};
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (DAG.getDataLayout().isBigEndian())
ShAmt = AdjustBigEndianShift(ShAmt);
uint64_t PtrOff = ShAmt / 8;
unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
SDLoc DL(LN0);
// The original load itself didn't wrap, so an offset within it doesn't.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
SDValue NewPtr =
DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags);
AddToWorklist(NewPtr.getNode());
SDValue Load;
if (ExtType == ISD::NON_EXTLOAD)
Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
else
Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
NewAlign, LN0->getMemOperand()->getFlags(),
LN0->getAAInfo());
// Replace the old load's chain with the new load's chain.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
// Shift the result left, if we've swallowed a left shift.
SDValue Result = Load;
if (ShLeftAmt != 0) {
EVT ShImmTy = getShiftAmountTy(Result.getValueType());
if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
ShImmTy = VT;
// If the shift amount is as large as the result size (but, presumably,
// no larger than the source) then the useful bits of the result are
// zero; we can't simply return the shortened shift, because the result
// of that operation is undefined.
if (ShLeftAmt >= VT.getSizeInBits())
Result = DAG.getConstant(0, DL, VT);
else
Result = DAG.getNode(ISD::SHL, DL, VT,
Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
}
if (HasShiftedOffset) {
// Recalculate the shift amount after it has been altered to calculate
// the offset.
if (DAG.getDataLayout().isBigEndian())
ShAmt = AdjustBigEndianShift(ShAmt);
// We're using a shifted mask, so the load now has an offset. This means the
// data has been loaded into lower-order bytes than it originally occupied, so
// we need to shl the loaded data into the correct position in the register.
SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
}
// Return the new loaded value.
return Result;
}
SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT ExtVT = cast<VTSDNode>(N1)->getVT();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
// sext_in_reg(undef) = 0 because the top bits will all be the same.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// fold (sext_in_reg c1) -> c1
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))
return N0;
// fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
N1);
// fold (sext_in_reg (sext x)) -> (sext x)
// fold (sext_in_reg (aext x)) -> (sext x)
// if x is small enough or if we know that x has more than 1 sign bit and the
// sign_extend_inreg is extending from one of them.
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N00 = N0.getOperand(0);
unsigned N00Bits = N00.getScalarValueSizeInBits();
if ((N00Bits <= ExtVTBits ||
(N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
}
// fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
if (!LegalOperations ||
TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
N0.getOperand(0));
}
// fold (sext_in_reg (zext x)) -> (sext x)
// iff we are extending the source sign bit.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getScalarValueSizeInBits() == ExtVTBits &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
}
// fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);
// fold operands of sext_in_reg based on knowledge that the top bits are not
// demanded.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (sext_in_reg (load x)) -> (smaller sextload x)
// fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
if (SDValue NarrowLoad = ReduceLoadWidth(N))
return NarrowLoad;
// fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
// fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
// We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
if (N0.getOpcode() == ISD::SRL) {
if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
// We can turn this into an SRA iff the input to the SRL is already sign
// extended enough.
unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
N0.getOperand(1));
}
}
// fold (sext_inreg (extload x)) -> (sextload x)
// If sextload is not supported by the target, we can only do the combine when
// the load has one use. Doing otherwise can block folding the extload with
// other extends that the target does support.
if (ISD::isEXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
N0.hasOneUse()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
LN0->getBasePtr(), ExtVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
AddToWorklist(ExtLoad.getNode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
N0.hasOneUse() &&
ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
LN0->getBasePtr(), ExtVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
N0.getOperand(1), false))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
}
return SDValue();
}
SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// sext_vector_inreg(undef) = 0 because the top bits will all be the same.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
}
SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// zext_vector_inreg(undef) = 0 because the top bits will be zero.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
}
SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
bool isLE = DAG.getDataLayout().isLittleEndian();
// noop truncate
if (SrcVT == VT)
return N0;
// fold (truncate (truncate x)) -> (truncate x)
if (N0.getOpcode() == ISD::TRUNCATE)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
// fold (truncate c1) -> c1
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
if (C.getNode() != N)
return C;
}
// fold (truncate (ext x)) -> (ext x) or (truncate x) or x
if (N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND) {
// if the source is smaller than the dest, we still need an extend.
if (N0.getOperand(0).getValueType().bitsLT(VT))
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
// if the source is larger than the dest, then we just need the truncate.
if (N0.getOperand(0).getValueType().bitsGT(VT))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
// if the source and dest are the same type, we can drop both the extend
// and the truncate.
return N0.getOperand(0);
}
// If this is anyext(trunc), don't fold it, allow ourselves to be folded.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
return SDValue();
// Fold extract-and-trunc into a narrow extract. For example:
// i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
// i32 y = TRUNCATE(i64 x)
// -- becomes --
// v16i8 b = BITCAST (v2i64 val)
// i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
//
// Note: We only run this optimization after type legalization (which often
// creates this pattern) and before operation legalization, after which
// we need to be more careful about the vector instructions that we generate.
if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
EVT VecTy = N0.getOperand(0).getValueType();
EVT ExTy = N0.getValueType();
EVT TrTy = N->getValueType(0);
unsigned NumElem = VecTy.getVectorNumElements();
unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
SDValue EltNo = N0->getOperand(1);
if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
SDLoc DL(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
DAG.getBitcast(NVT, N0.getOperand(0)),
DAG.getVectorIdxConstant(Index, DL));
}
}
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
TLI.isTruncateFree(SrcVT, VT)) {
SDLoc SL(N0);
SDValue Cond = N0.getOperand(0);
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
}
}
// trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
(!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
SDValue Amt = N0.getOperand(1);
KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
SDLoc SL(N);
EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
if (AmtVT != Amt.getValueType()) {
Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
AddToWorklist(Amt.getNode());
}
return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
}
}
// Attempt to pre-truncate BUILD_VECTOR sources.
if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
// Avoid creating illegal types if running after type legalizer.
(!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
SDLoc DL(N);
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 8> TruncOps;
for (const SDValue &Op : N0->op_values()) {
SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
TruncOps.push_back(TruncOp);
}
return DAG.getBuildVector(VT, DL, TruncOps);
}
// Fold a series of buildvector, bitcast, and truncate if possible.
// For example fold
// (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
// (2xi32 (buildvector x, y)).
if (Level == AfterLegalizeVectorOps && VT.isVector() &&
N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
N0.getOperand(0).hasOneUse()) {
SDValue BuildVect = N0.getOperand(0);
EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
EVT TruncVecEltTy = VT.getVectorElementType();
// Check that the element types match.
if (BuildVectEltTy == TruncVecEltTy) {
// Now we only need to compute the offset of the truncated elements.
unsigned BuildVecNumElts = BuildVect.getNumOperands();
unsigned TruncVecNumElts = VT.getVectorNumElements();
unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
"Invalid number of elements");
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
Opnds.push_back(BuildVect.getOperand(i));
return DAG.getBuildVector(VT, SDLoc(N), Opnds);
}
}
// See if we can simplify the input to this truncate through knowledge that
// only the low bits are being used.
// For example "trunc (or (shl x, 8), y)" // -> trunc y
// Currently we only perform this optimization on scalars because vectors
// may have different active low bits.
if (!VT.isVector()) {
APInt Mask =
APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
}
// fold (truncate (load x)) -> (smaller load x)
// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
if (SDValue Reduced = ReduceLoadWidth(N))
return Reduced;
// Handle the case where the load remains an extending load even
// after truncation.
if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (LN0->isSimple() &&
LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
VT, LN0->getChain(), LN0->getBasePtr(),
LN0->getMemoryVT(),
LN0->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
return NewLoad;
}
}
}
// fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...),
// where ... are all 'undef'.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
SmallVector<EVT, 8> VTs;
SDValue V;
unsigned Idx = 0;
unsigned NumDefs = 0;
for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
SDValue X = N0.getOperand(i);
if (!X.isUndef()) {
V = X;
Idx = i;
NumDefs++;
}
// Stop if more than one member is non-undef.
if (NumDefs > 1)
break;
+
VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
- X.getValueType().getVectorNumElements()));
+ X.getValueType().getVectorElementCount()));
}
if (NumDefs == 0)
return DAG.getUNDEF(VT);
if (NumDefs == 1) {
assert(V.getNode() && "The single defined operand is empty!");
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
if (i != Idx) {
Opnds.push_back(DAG.getUNDEF(VTs[i]));
continue;
}
SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
AddToWorklist(NV.getNode());
Opnds.push_back(NV);
}
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
}
}
// Fold truncate of a bitcast of a vector to an extract of the low vector
// element.
//
// e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
SDValue VecSrc = N0.getOperand(0);
EVT VecSrcVT = VecSrc.getValueType();
if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
(!LegalOperations ||
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
SDLoc SL(N);
unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
DAG.getVectorIdxConstant(Idx, SL));
}
}
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
// (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
// When the adde's carry is not used.
if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
// We only do this for addcarry before operation legalization.
((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
TLI.isOperationLegal(N0.getOpcode(), VT))) {
SDLoc SL(N);
auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
auto VTs = DAG.getVTList(VT, N0->getValueType(1));
return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
}
// fold (truncate (extract_subvector(ext x))) ->
// (extract_subvector x)
// TODO: This can be generalized to cover cases where the truncate and extract
// do not fully cancel each other out.
if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::SIGN_EXTEND ||
N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) {
if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
VT.getVectorElementType())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
N00.getOperand(0), N0.getOperand(1));
}
}
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
// Narrow a suitable binary operation with a non-opaque constant operand by
// moving it ahead of the truncate. This is limited to pre-legalization
// because targets may prefer a wider type during later combines and invert
// this transform.
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
if (!LegalOperations && N0.hasOneUse() &&
(isConstantOrConstantVector(N0.getOperand(0), true) ||
isConstantOrConstantVector(N0.getOperand(1), true))) {
// TODO: We already restricted this to pre-legalization, but for vectors
// we are extra cautious to not create an unsupported operation.
// Target-specific changes are likely needed to avoid regressions here.
if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDLoc DL(N);
SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
}
}
}
return SDValue();
}
static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
SDValue Elt = N->getOperand(i);
if (Elt.getOpcode() != ISD::MERGE_VALUES)
return Elt.getNode();
return Elt.getOperand(Elt.getResNo()).getNode();
}
/// build_pair (load, load) -> load
/// if load locations are consecutive.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
assert(N->getOpcode() == ISD::BUILD_PAIR);
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
// A BUILD_PAIR always has the least significant part in elt 0 and the
// most significant part in elt 1. So when combining into one large load, we
// need to consider the endianness.
if (DAG.getDataLayout().isBigEndian())
std::swap(LD1, LD2);
if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
EVT LD1VT = LD1->getValueType(0);
unsigned LD1Bytes = LD1VT.getStoreSize();
if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
Align Alignment = LD1->getAlign();
Align NewAlign = DAG.getDataLayout().getABITypeAlign(
VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign <= Alignment &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
LD1->getPointerInfo(), Alignment);
}
return SDValue();
}
static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
// On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
// and Lo parts; on big-endian machines it doesn't.
return DAG.getDataLayout().isBigEndian() ? 1 : 0;
}
static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
// If this is not a bitcast to an FP type or if the target doesn't have
// IEEE754-compliant FP logic, we're done.
EVT VT = N->getValueType(0);
if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
return SDValue();
// TODO: Handle cases where the integer constant is a different scalar
// bitwidth to the FP.
SDValue N0 = N->getOperand(0);
EVT SourceVT = N0.getValueType();
if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
return SDValue();
unsigned FPOpcode;
APInt SignMask;
switch (N0.getOpcode()) {
case ISD::AND:
FPOpcode = ISD::FABS;
SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
break;
case ISD::XOR:
FPOpcode = ISD::FNEG;
SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
break;
case ISD::OR:
FPOpcode = ISD::FABS;
SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
break;
default:
return SDValue();
}
// Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
// Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
// Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
// fneg (fabs X)
SDValue LogicOp0 = N0.getOperand(0);
ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.getOperand(0).getValueType() == VT) {
SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
NumFPLogicOpsConv++;
if (N0.getOpcode() == ISD::OR)
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
return FPOp;
}
return SDValue();
}
SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.isUndef())
return DAG.getUNDEF(VT);
// If the input is a BUILD_VECTOR with all constant elements, fold this now.
// Only do this before legalize types, unless both types are integer and the
// scalar type is legal. Only do this before legalize ops, since the target
// may be depending on the bitcast.
// First check to see if this is all constant.
// TODO: Support FP bitcasts after legalize types.
if (VT.isVector() &&
(!LegalTypes ||
(!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
TLI.isTypeLegal(VT.getVectorElementType()))) &&
N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
cast<BuildVectorSDNode>(N0)->isConstant())
return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
VT.getVectorElementType());
// If the input is a constant, let getNode fold it.
if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
// If we can't allow illegal operations, we need to check that this is just
// an fp -> int or int -> fp conversion and that the resulting operation will
// be legal.
if (!LegalOperations ||
(isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
(isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
TLI.isOperationLegal(ISD::Constant, VT))) {
SDValue C = DAG.getBitcast(VT, N0);
if (C.getNode() != N)
return C;
}
}
// (conv (conv x, t1), t2) -> (conv x, t2)
if (N0.getOpcode() == ISD::BITCAST)
return DAG.getBitcast(VT, N0.getOperand(0));
// fold (conv (load x)) -> (load (conv*)x)
// If the resultant load doesn't need a higher alignment than the original!
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not remove the cast if the types differ in endian layout.
TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
// If the load is volatile, we only want to change the load type if the
// resulting load is legal. Otherwise we might increase the number of
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
TLI.isOperationLegal(ISD::LOAD, VT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
*LN0->getMemOperand())) {
SDValue Load =
DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
}
}
if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
return V;
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
//
// For ppc_fp128:
// fold (bitcast (fneg x)) ->
// flipbit = signbit
// (xor (bitcast x) (build_pair flipbit, flipbit))
//
// fold (bitcast (fabs x)) ->
// flipbit = (and (extract_element (bitcast x), 0), signbit)
// (xor (bitcast x) (build_pair flipbit, flipbit))
// This often reduces constant pool loads.
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
N0.getNode()->hasOneUse() && VT.isInteger() &&
!VT.isVector() && !N0.getValueType().isVector()) {
SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
AddToWorklist(NewConv.getNode());
SDLoc DL(N);
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
assert(VT.getSizeInBits() == 128);
SDValue SignBit = DAG.getConstant(
APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
SDValue FlipBit;
if (N0.getOpcode() == ISD::FNEG) {
FlipBit = SignBit;
AddToWorklist(FlipBit.getNode());
} else {
assert(N0.getOpcode() == ISD::FABS);
SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
SDLoc(NewConv)));
AddToWorklist(Hi.getNode());
FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
AddToWorklist(FlipBit.getNode());
}
SDValue FlipBits =
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
AddToWorklist(FlipBits.getNode());
return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
}
APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
if (N0.getOpcode() == ISD::FNEG)
return DAG.getNode(ISD::XOR, DL, VT,
NewConv, DAG.getConstant(SignBit, DL, VT));
assert(N0.getOpcode() == ISD::FABS);
return DAG.getNode(ISD::AND, DL, VT,
NewConv, DAG.getConstant(~SignBit, DL, VT));
}
// fold (bitconvert (fcopysign cst, x)) ->
// (or (and (bitconvert x), sign), (and cst, (not sign)))
// Note that we don't handle (copysign x, cst) because this can always be
// folded to an fneg or fabs.
//
// For ppc_fp128:
// fold (bitcast (fcopysign cst, x)) ->
// flipbit = (and (extract_element
// (xor (bitcast cst), (bitcast x)), 0),
// signbit)
// (xor (bitcast cst) (build_pair flipbit, flipbit))
if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
isa<ConstantFPSDNode>(N0.getOperand(0)) &&
VT.isInteger() && !VT.isVector()) {
unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
if (isTypeLegal(IntXVT)) {
SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
AddToWorklist(X.getNode());
// If X has a different width than the result/lhs, sext it or truncate it.
unsigned VTWidth = VT.getSizeInBits();
if (OrigXWidth < VTWidth) {
X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
AddToWorklist(X.getNode());
} else if (OrigXWidth > VTWidth) {
// To get the sign bit in the right place, we have to shift it right
// before truncating.
SDLoc DL(X);
X = DAG.getNode(ISD::SRL, DL,
X.getValueType(), X,
DAG.getConstant(OrigXWidth-VTWidth, DL,
X.getValueType()));
AddToWorklist(X.getNode());
X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
AddToWorklist(X.getNode());
}
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
AddToWorklist(Cst.getNode());
SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
AddToWorklist(X.getNode());
SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
AddToWorklist(XorResult.getNode());
SDValue XorResult64 = DAG.getNode(
ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
SDLoc(XorResult)));
AddToWorklist(XorResult64.getNode());
SDValue FlipBit =
DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
AddToWorklist(FlipBit.getNode());
SDValue FlipBits =
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
AddToWorklist(FlipBits.getNode());
return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
}
APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
X = DAG.getNode(ISD::AND, SDLoc(X), VT,
X, DAG.getConstant(SignBit, SDLoc(X), VT));
AddToWorklist(X.getNode());
SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
AddToWorklist(Cst.getNode());
return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
}
}
// bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
if (N0.getOpcode() == ISD::BUILD_PAIR)
if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
return CombineLD;
// Remove double bitcasts from shuffles - this is often a legacy of
// XformToShuffleWithZero being used to combine bitmaskings (of
// float vectors bitcast to integer vectors) into shuffles.
// bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
!(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
// If an operand is a bitcast, peek through it if the value being cast has the
// original VT. If an operand is a constant, just bitcast it back to the
// original VT.
auto PeekThroughBitcast = [&](SDValue Op) {
if (Op.getOpcode() == ISD::BITCAST &&
Op.getOperand(0).getValueType() == VT)
return SDValue(Op.getOperand(0));
if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
return DAG.getBitcast(VT, Op);
return SDValue();
};
// FIXME: If either input vector is bitcast, try to convert the shuffle to
// the result type of this bitcast. This would eliminate at least one
// bitcast. See the transform in InstCombine.
SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
if (!(SV0 && SV1))
return SDValue();
int MaskScale =
VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
SmallVector<int, 8> NewMask;
for (int M : SVN->getMask())
for (int i = 0; i != MaskScale; ++i)
NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
if (LegalShuffle)
return LegalShuffle;
}
return SDValue();
}
SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
EVT VT = N->getValueType(0);
return CombineConsecutiveLoads(N, VT);
}
SDValue DAGCombiner::visitFREEZE(SDNode *N) {
SDValue N0 = N->getOperand(0);
// (freeze (freeze x)) -> (freeze x)
if (N0.getOpcode() == ISD::FREEZE)
return N0;
// If the input is a constant, return it.
if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0))
return N0;
return SDValue();
}
/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
/// operands. DstEltVT indicates the destination element value type.
SDValue DAGCombiner::
ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
// If this is already the right type, we're done.
if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
unsigned SrcBitSize = SrcEltVT.getSizeInBits();
unsigned DstBitSize = DstEltVT.getSizeInBits();
// If this is a conversion of N elements of one type to N elements of another
// type, convert each element. This handles FP<->INT cases.
if (SrcBitSize == DstBitSize) {
SmallVector<SDValue, 8> Ops;
for (SDValue Op : BV->op_values()) {
// If the vector element type is not legal, the BUILD_VECTOR operands
// are promoted and implicitly truncated. Make that explicit here.
if (Op.getValueType() != SrcEltVT)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
Ops.push_back(DAG.getBitcast(DstEltVT, Op));
AddToWorklist(Ops.back().getNode());
}
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
BV->getValueType(0).getVectorNumElements());
return DAG.getBuildVector(VT, SDLoc(BV), Ops);
}
// Otherwise, we're growing or shrinking the elements. To avoid having to
// handle annoying details of growing/shrinking FP values, we convert them to
// int first.
if (SrcEltVT.isFloatingPoint()) {
// Convert the input float vector to an int vector where the elements are the
// same size.
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
SrcEltVT = IntVT;
}
// Now we know the input is an integer vector. If the output is an FP type,
// convert to integers of the destination element size first, then to FP.
if (DstEltVT.isFloatingPoint()) {
EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
// Next, convert to FP elements of the same size.
return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
}
SDLoc DL(BV);
// Okay, we know the src/dst types are both integers of differing sizes.
// Handle the growing case first.
assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
if (SrcBitSize < DstBitSize) {
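// e.g. four i8 constants becoming one i32 element: pack NumInputsPerOutput
// narrow constants into each wide element, with the first input landing in
// the low bits on little-endian targets and in the high bits on big-endian.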
unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = BV->getNumOperands(); i != e;
i += NumInputsPerOutput) {
bool isLE = DAG.getDataLayout().isLittleEndian();
APInt NewBits = APInt(DstBitSize, 0);
bool EltIsUndef = true;
for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
// Shift the previously computed bits over.
NewBits <<= SrcBitSize;
SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
if (Op.isUndef()) continue;
EltIsUndef = false;
NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
zextOrTrunc(SrcBitSize).zext(DstBitSize);
}
if (EltIsUndef)
Ops.push_back(DAG.getUNDEF(DstEltVT));
else
Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
}
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
return DAG.getBuildVector(VT, DL, Ops);
}
// Finally, this must be the case where we are shrinking elements: each input
// turns into multiple outputs.
unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
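// e.g. one i32 constant becoming four i8 elements: emit the pieces of each
// wide constant low bits first, then reverse their order on big-endian
// targets below.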
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
NumOutputsPerInput*BV->getNumOperands());
SmallVector<SDValue, 8> Ops;
for (const SDValue &Op : BV->op_values()) {
if (Op.isUndef()) {
Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
continue;
}
APInt OpVal = cast<ConstantSDNode>(Op)->
getAPIntValue().zextOrTrunc(SrcBitSize);
for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
APInt ThisVal = OpVal.trunc(DstBitSize);
Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
OpVal.lshrInPlace(DstBitSize);
}
// For big endian targets, swap the order of the pieces of each element.
if (DAG.getDataLayout().isBigEndian())
std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
}
return DAG.getBuildVector(VT, DL, Ops);
}
static bool isContractable(SDNode *N) {
SDNodeFlags F = N->getFlags();
return F.hasAllowContract() || F.hasAllowReassociation();
}
/// Try to perform FMA combining on a given FADD node.
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
SDNodeFlags Flags = N->getFlags();
bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool CanReassociate =
Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
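// CanReassociate only gates the fold below that regroups
// (fadd (fma A, B, (fmul C, D)), E) into nested FMAs; the other folds only
// need contraction.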
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
CanFuse || HasFMAD);
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
if (N.getOpcode() != ISD::FMUL)
return false;
return AllowFusionGlobally || isContractable(N.getNode());
};
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
if (N0.getNode()->use_size() > N1.getNode()->use_size())
std::swap(N0, N1);
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1), N1, Flags);
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N1.getOperand(0), N1.getOperand(1), N0, Flags);
}
// fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
// fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
// This requires reassociation because it changes the order of operations.
SDValue FMA, E;
if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode &&
N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
N0.getOperand(2).hasOneUse()) {
FMA = N0;
E = N1;
} else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
N1.getOperand(2).hasOneUse()) {
FMA = N1;
E = N0;
}
if (FMA && E) {
SDValue A = FMA.getOperand(0);
SDValue B = FMA.getOperand(1);
SDValue C = FMA.getOperand(2).getOperand(0);
SDValue D = FMA.getOperand(2).getOperand(1);
SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags);
return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags);
}
// Look through FP_EXTEND nodes to do more combining.
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(1)), N1, Flags);
}
}
// fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(1)), N0, Flags);
}
}
// More folding opportunities when target permits.
if (Aggressive) {
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
Z, Flags), Flags);
};
if (N0.getOpcode() == PreferredFusedOpcode) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
if (isContractableFMUL(N020) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N020.getValueType())) {
return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
N020.getOperand(0), N020.getOperand(1),
N1, Flags);
}
}
}
// fold (fadd (fpext (fma x, y, (fmul u, v))), z)
// -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
auto FoldFAddFPExtFMAFMul = [&] (
SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
Z, Flags), Flags);
};
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
N002.getOperand(0), N002.getOperand(1),
N1, Flags);
}
}
}
// fold (fadd x, (fma y, z, (fpext (fmul u, v))))
// -> (fma y, z, (fma (fpext u), (fpext v), x))
if (N1.getOpcode() == PreferredFusedOpcode) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
if (isContractableFMUL(N120) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N120.getValueType())) {
return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
N120.getOperand(0), N120.getOperand(1),
N0, Flags);
}
}
}
// fold (fadd x, (fpext (fma y, z, (fmul u, v))))
// -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (N10.getOpcode() == PreferredFusedOpcode) {
SDValue N102 = N10.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
N102.getOperand(0), N102.getOperand(1),
N0, Flags);
}
}
}
}
return SDValue();
}
/// Try to perform FMA combining on a given FSUB node.
SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
const SDNodeFlags Flags = N->getFlags();
bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
CanFuse || HasFMAD);
// If the subtraction is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
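// NoSignedZero additionally gates the fold below that pushes the negation
// through both multiplies of an existing FMA chain.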
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
if (N.getOpcode() != ISD::FMUL)
return false;
return AllowFusionGlobally || isContractable(N.getNode());
};
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z),
Flags);
}
return SDValue();
};
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
YZ.getOperand(1), X, Flags);
}
return SDValue();
};
// If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
(N0.getNode()->use_size() > N1.getNode()->use_size())) {
// fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
if (SDValue V = tryToFoldXSubYZ(N0, N1))
return V;
// fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
if (SDValue V = tryToFoldXYSubZ(N0, N1))
return V;
} else {
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
if (SDValue V = tryToFoldXYSubZ(N0, N1))
return V;
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
if (SDValue V = tryToFoldXSubYZ(N0, N1))
return V;
}
// fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
(Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
SDValue N00 = N0.getOperand(0).getOperand(0);
SDValue N01 = N0.getOperand(0).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
}
// Look through FP_EXTEND nodes to do more combining.
// fold (fsub (fpext (fmul x, y)), z)
// -> (fma (fpext x), (fpext y), (fneg z))
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(1)),
DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
}
}
// fold (fsub x, (fpext (fmul y, z)))
// -> (fma (fneg (fpext y)), (fpext z), x)
// Note: Commutes FSUB operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(0))),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(1)),
N0, Flags);
}
}
// fold (fsub (fpext (fneg (fmul x, y))), z)
// -> (fneg (fma (fpext x), (fpext y), z))
// Note: This could be removed with appropriate canonicalization of the
// input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
// from implementing the canonicalization in visitFSUB.
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FNEG) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(1)),
N1, Flags));
}
}
}
// fold (fsub (fneg (fpext (fmul x, y))), z)
// -> (fneg (fma (fpext x), (fpext y), z))
// Note: This could be removed with appropriate canonicalization of the
// input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
// from implementing the canonicalization in visitFSUB.
if (N0.getOpcode() == ISD::FNEG) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FP_EXTEND) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N000.getValueType())) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(1)),
N1, Flags));
}
}
}
// More folding opportunities when target permits.
if (Aggressive) {
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y, (fma u, v, (fneg z)))
if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(2).getOperand(0),
N0.getOperand(2).getOperand(1),
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
}
// fold (fsub x, (fma y, z, (fmul u, v)))
// -> (fma (fneg y), z, (fma (fneg u), v, x))
if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N1.getOperand(2)) &&
N1->hasOneUse() && NoSignedZero) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1.getOperand(0)),
N1.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N20),
N21, N0, Flags), Flags);
}
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
if (N0.getOpcode() == PreferredFusedOpcode &&
N0->hasOneUse()) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
if (isContractableFMUL(N020) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N020.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N020.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N020.getOperand(1)),
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
}
}
}
// fold (fsub (fpext (fma x, y, (fmul u, v))), z)
// -> (fma (fpext x), (fpext y),
// (fma (fpext u), (fpext v), (fneg z)))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(1)),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N002.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N002.getOperand(1)),
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
}
}
}
// fold (fsub x, (fma y, z, (fpext (fmul u, v))))
// -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
if (N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
N1->hasOneUse()) {
SDValue N120 = N1.getOperand(2).getOperand(0);
if (isContractableFMUL(N120) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N120.getValueType())) {
SDValue N1200 = N120.getOperand(0);
SDValue N1201 = N120.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
N1.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL,
VT, N1200)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N1201),
N0, Flags), Flags);
}
}
// fold (fsub x, (fpext (fma y, z, (fmul u, v))))
// -> (fma (fneg (fpext y)), (fpext z),
// (fma (fneg (fpext u)), (fpext v), x))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
SDValue CvtSrc = N1.getOperand(0);
SDValue N100 = CvtSrc.getOperand(0);
SDValue N101 = CvtSrc.getOperand(1);
SDValue N102 = CvtSrc.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
CvtSrc.getValueType())) {
SDValue N1020 = N102.getOperand(0);
SDValue N1021 = N102.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N100)),
DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL,
VT, N1020)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N1021),
N0, Flags), Flags);
}
}
}
return SDValue();
}
/// Try to perform FMA combining on a given FMUL node based on the distributive
/// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
/// subtraction instead of addition).
SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
const SDNodeFlags Flags = N->getFlags();
assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
const TargetOptions &Options = DAG.getTarget().Options;
// The transforms below are incorrect when x == 0 and y == inf, because the
// intermediate multiplication produces a nan.
if (!Options.NoInfsFPMath)
return SDValue();
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// Floating-point multiply-add with intermediate rounding. This can result
// in a less precise result due to the changed rounding order.
bool HasFMAD = Options.UnsafeFPMath &&
(LegalOperations && TLI.isFMADLegal(DAG, N));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
// fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
// fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
if (C->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
Y, Flags);
if (C->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
}
}
return SDValue();
};
if (SDValue FMA = FuseFADD(N0, N1, Flags))
return FMA;
if (SDValue FMA = FuseFADD(N1, N0, Flags))
return FMA;
// fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
// fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
// fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
// fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
if (C0->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
Y, Flags);
if (C0->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
}
if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
if (C1->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
if (C1->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
Y, Flags);
}
}
return SDValue();
};
if (SDValue FMA = FuseFSUB(N0, N1, Flags))
return FMA;
if (SDValue FMA = FuseFSUB(N1, N0, Flags))
return FMA;
return SDValue();
}
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fadd c1, c2) -> c1 + c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);
// canonicalize constant to RHS
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
if (N1C && N1C->isZero())
if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (fadd A, (fneg B)) -> (fsub A, B)
if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
N1, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags);
// fold (fadd (fneg A), B) -> (fsub B, A)
if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
N0, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags);
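// Matches a single-use (fmul B, -2.0); used below to rewrite A + B*-2.0 as
// A - (B + B).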
auto isFMulNegTwo = [](SDValue FMul) {
if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
return false;
auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
return C && C->isExactlyValue(-2.0);
};
// fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
if (isFMulNegTwo(N0)) {
SDValue B = N0.getOperand(0);
SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
}
// fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
if (isFMulNegTwo(N1)) {
SDValue B = N1.getOperand(0);
SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
}
// No FP constant should be created after legalization as the Instruction
// Selection pass has a hard time dealing with FP constants.
bool AllowNewConst = (Level < AfterLegalizeDAG);
// If nnan is enabled, fold lots of things.
if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
// If allowed, fold (fadd (fneg x), x) -> 0.0
if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
return DAG.getConstantFP(0.0, DL, VT);
// If allowed, fold (fadd x, (fneg x)) -> 0.0
if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
return DAG.getConstantFP(0.0, DL, VT);
}
// If 'unsafe math' or reassoc and nsz, fold lots of things.
// TODO: break out portions of the transformations below for which Unsafe is
// considered and which do not require both nsz and reassoc
if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
AllowNewConst) {
// fadd (fadd x, c1), c2 -> fadd x, c1 + c2
if (N1CFP && N0.getOpcode() == ISD::FADD &&
isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags);
return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
}
// We can fold chains of FADD's of the same value into multiplications.
// This transform is not safe in general because we are reducing the number
// of rounding steps.
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
if (N0.getOpcode() == ISD::FMUL) {
bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
// (fadd (fmul x, c), x) -> (fmul x, c+1)
if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
DAG.getConstantFP(1.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
}
// (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
N1.getOperand(0) == N1.getOperand(1) &&
N0.getOperand(0) == N1.getOperand(0)) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
DAG.getConstantFP(2.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
}
}
if (N1.getOpcode() == ISD::FMUL) {
bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
// (fadd x, (fmul x, c)) -> (fmul x, c+1)
if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
DAG.getConstantFP(1.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
}
// (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
N0.getOperand(0) == N0.getOperand(1) &&
N1.getOperand(0) == N0.getOperand(0)) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
DAG.getConstantFP(2.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
}
}
if (N0.getOpcode() == ISD::FADD) {
bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
// (fadd (fadd x, x), x) -> (fmul x, 3.0)
if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
(N0.getOperand(0) == N1)) {
return DAG.getNode(ISD::FMUL, DL, VT,
N1, DAG.getConstantFP(3.0, DL, VT), Flags);
}
}
if (N1.getOpcode() == ISD::FADD) {
bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
N1.getOperand(0) == N0) {
return DAG.getNode(ISD::FMUL, DL, VT,
N0, DAG.getConstantFP(3.0, DL, VT), Flags);
}
}
// (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
N0.getOperand(0) == N0.getOperand(1) &&
N1.getOperand(0) == N1.getOperand(1) &&
N0.getOperand(0) == N1.getOperand(0)) {
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
DAG.getConstantFP(4.0, DL, VT), Flags);
}
}
} // enable-unsafe-fp-math
// FADD -> FMA combines:
if (SDValue Fused = visitFADDForFMACombine(N)) {
AddToWorklist(Fused.getNode());
return Fused;
}
return SDValue();
}
SDValue DAGCombiner::visitFSUB(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fsub c1, c2) -> c1-c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// (fsub A, 0) -> A
if (N1CFP && N1CFP->isZero()) {
if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
Flags.hasNoSignedZeros()) {
return N0;
}
}
if (N0 == N1) {
// (fsub x, x) -> 0.0
if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
return DAG.getConstantFP(0.0f, DL, VT);
}
// (fsub -0.0, N1) -> -N1
// NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
// FSUB does not specify the sign bit of a NaN. Also note that for
// the same reason, the inverse transform is not safe, unless fast math
// flags are in play.
if (N0CFP && N0CFP->isZero()) {
if (N0CFP->isNegative() ||
(Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
if (SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
return NegN1;
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
}
}
if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
N1.getOpcode() == ISD::FADD) {
// X - (X + Y) -> -Y
if (N0 == N1->getOperand(0))
return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags);
// X - (Y + X) -> -Y
if (N0 == N1->getOperand(1))
return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0), Flags);
}
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags);
// FSUB -> FMA combines:
if (SDValue Fused = visitFSUBForFMACombine(N)) {
AddToWorklist(Fused.getNode());
return Fused;
}
return SDValue();
}
SDValue DAGCombiner::visitFMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector()) {
// This just handles C1 * C2 for vectors. Other vector folds are below.
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
}
// fold (fmul c1, c2) -> c1*c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);
// canonicalize constant to RHS
if (isConstantFPBuildVectorOrConstantFP(N0) &&
!isConstantFPBuildVectorOrConstantFP(N1))
return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
// fold (fmul A, 0) -> 0
if (N1CFP && N1CFP->isZero())
return N1;
}
if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
// fmul (fmul X, C1), C2 -> fmul X, C1 * C2
if (isConstantFPBuildVectorOrConstantFP(N1) &&
N0.getOpcode() == ISD::FMUL) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
// Avoid an infinite loop by making sure that N00 is not a constant
// (the inner multiply has not been constant folded yet).
if (isConstantFPBuildVectorOrConstantFP(N01) &&
!isConstantFPBuildVectorOrConstantFP(N00)) {
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
}
}
// Match a special-case: we convert X * 2.0 into fadd.
// fmul (fadd X, X), C -> fmul X, 2.0 * C
if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
N0.getOperand(0) == N0.getOperand(1)) {
const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
}
}
// fold (fmul X, 2.0) -> (fadd X, X)
if (N1CFP && N1CFP->isExactlyValue(+2.0))
return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);
// fold (fmul X, -1.0) -> (fneg X)
if (N1CFP && N1CFP->isExactlyValue(-1.0))
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N0);
// -N0 * -N1 --> N0 * N1
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost::Expensive;
TargetLowering::NegatibleCost CostN1 =
TargetLowering::NegatibleCost::Expensive;
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);
// fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
// fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
(N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
TLI.isOperationLegal(ISD::FABS, VT)) {
SDValue Select = N0, X = N1;
if (Select.getOpcode() != ISD::SELECT)
std::swap(Select, X);
SDValue Cond = Select.getOperand(0);
auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
if (TrueOpnd && FalseOpnd &&
Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETOLT:
case ISD::SETULT:
case ISD::SETOLE:
case ISD::SETULE:
case ISD::SETLT:
case ISD::SETLE:
std::swap(TrueOpnd, FalseOpnd);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETUGT:
case ISD::SETOGE:
case ISD::SETUGE:
case ISD::SETGT:
case ISD::SETGE:
if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT,
DAG.getNode(ISD::FABS, DL, VT, X));
if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
return DAG.getNode(ISD::FABS, DL, VT, X);
break;
}
}
}
// FMUL -> FMA combines:
if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
AddToWorklist(Fused.getNode());
return Fused;
}
return SDValue();
}
SDValue DAGCombiner::visitFMA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// FMA nodes have flags that propagate to the created nodes.
const SDNodeFlags Flags = N->getFlags();
bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);
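// Treat a per-node contract/reassoc flag like global unsafe math for the
// algebraic folds guarded by UnsafeFPMath below.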
// Constant fold FMA.
if (isa<ConstantFPSDNode>(N0) &&
isa<ConstantFPSDNode>(N1) &&
isa<ConstantFPSDNode>(N2)) {
return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
}
// (-N0 * -N1) + N2 --> (N0 * N1) + N2
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost::Expensive;
TargetLowering::NegatibleCost CostN1 =
TargetLowering::NegatibleCost::Expensive;
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);
if (UnsafeFPMath) {
if (N0CFP && N0CFP->isZero())
return N2;
if (N1CFP && N1CFP->isZero())
return N2;
}
// TODO: The FMA node should have flags that propagate to these nodes.
if (N0CFP && N0CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
if (N1CFP && N1CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
// Canonicalize (fma c, x, y) -> (fma x, c, y)
if (isConstantFPBuildVectorOrConstantFP(N0) &&
!isConstantFPBuildVectorOrConstantFP(N1))
return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
if (UnsafeFPMath) {
// (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
isConstantFPBuildVectorOrConstantFP(N1) &&
isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
Flags), Flags);
}
// (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
if (N0.getOpcode() == ISD::FMUL &&
isConstantFPBuildVectorOrConstantFP(N1) &&
isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
return DAG.getNode(ISD::FMA, DL, VT,
N0.getOperand(0),
DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
Flags),
N2);
}
}
// (fma x, 1, y) -> (fadd x, y)
// (fma x, -1, y) -> (fadd (fneg x), y)
if (N1CFP) {
if (N1CFP->isExactlyValue(1.0))
// TODO: The FMA node should have flags that propagate to this node.
return DAG.getNode(ISD::FADD, DL, VT, N0, N2);
if (N1CFP->isExactlyValue(-1.0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
AddToWorklist(RHSNeg.getNode());
// TODO: The FMA node should have flags that propagate to this node.
return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
}
// fma (fneg x), K, y -> fma x, -K, y
if (N0.getOpcode() == ISD::FNEG &&
(TLI.isOperationLegal(ISD::ConstantFP, VT) ||
(N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
ForCodeSize)))) {
return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
}
}
if (UnsafeFPMath) {
// (fma x, c, x) -> (fmul x, (c+1))
if (N1CFP && N0 == N2) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
DAG.getConstantFP(1.0, DL, VT), Flags),
Flags);
}
// (fma x, c, (fneg x)) -> (fmul x, (c-1))
if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
DAG.getConstantFP(-1.0, DL, VT), Flags),
Flags);
}
}
// fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
// fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
if (!TLI.isFNegFree(VT))
if (SDValue Neg = TLI.getCheaperNegatedExpression(
SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags);
return SDValue();
}
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
// Notice that this is not always beneficial. One reason is different targets
// may have different costs for FDIV and FMUL, so sometimes the cost of two
// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
// is that the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
// TODO: Limit this transform based on optsize/minsize - it always creates at
// least 1 extra instruction. But the perf win may be substantial enough
// that only minsize should restrict this.
bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
const SDNodeFlags Flags = N->getFlags();
if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
return SDValue();
// Skip if current node is a reciprocal/fneg-reciprocal.
SDValue N0 = N->getOperand(0);
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
return SDValue();
// Exit early if the target does not want this transform or if there can't
// possibly be enough uses of the divisor to make the transform worthwhile.
SDValue N1 = N->getOperand(1);
unsigned MinUses = TLI.combineRepeatedFPDivisors();
// For splat vectors, scale the number of uses by the splat factor. If we can
// convert the division into a scalar op, that will likely be much faster.
unsigned NumElts = 1;
EVT VT = N->getValueType(0);
if (VT.isVector() && DAG.isSplatValue(N1))
NumElts = VT.getVectorNumElements();
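// use_size() counts every use of the divisor, so this is only a cheap
// upper-bound check; the exact FDIV-user count is verified below.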
if (!MinUses || (N1->use_size() * NumElts) < MinUses)
return SDValue();
// Find all FDIV users of the same divisor.
// Use a set because duplicates may be present in the user list.
SetVector<SDNode *> Users;
for (auto *U : N1->uses()) {
if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
// This division is eligible for optimization only if global unsafe math
// is enabled or if this division allows reciprocal formation.
if (UnsafeMath || U->getFlags().hasAllowReciprocal())
Users.insert(U);
}
}
// Now that we have the actual number of divisor uses, make sure it meets
// the minimum threshold specified by the target.
if ((Users.size() * NumElts) < MinUses)
return SDValue();
SDLoc DL(N);
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
// Dividend / Divisor -> Dividend * Reciprocal
for (auto *U : Users) {
SDValue Dividend = U->getOperand(0);
if (Dividend != FPOne) {
SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
Reciprocal, Flags);
CombineTo(U, NewNode);
} else if (U != Reciprocal.getNode()) {
// In the absence of fast-math-flags, this user node is always the
// same node as Reciprocal, but with FMF they may be different nodes.
CombineTo(U, Reciprocal);
}
}
return SDValue(N, 0); // N was replaced.
}
SDValue DAGCombiner::visitFDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fdiv c1, c2) -> c1/c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (SDValue V = combineRepeatedFPDivisors(N))
return V;
if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (e.g. NaN, denormal, ...).
if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT), Flags);
}
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
}
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
}
} else if (N1.getOpcode() == ISD::FMUL) {
// Look through an FMUL. Even though this won't remove the FDIV directly,
// it's still worthwhile to get rid of the FSQRT if possible.
SDValue Sqrt, Y;
if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
Sqrt = N1.getOperand(0);
Y = N1.getOperand(1);
} else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
Sqrt = N1.getOperand(1);
Y = N1.getOperand(0);
}
if (Sqrt.getNode()) {
// If the other multiply operand is known positive, pull it into the
// sqrt. That will eliminate the division if we convert to an estimate:
// X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
// TODO: Also fold the case where A == Z (fabs is missing).
if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() &&
Y.getOpcode() == ISD::FABS && Y.hasOneUse()) {
SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0),
Y.getOperand(0), Flags);
SDValue AAZ =
DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags);
if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags);
// Estimate creation failed. Clean up speculatively created nodes.
recursivelyDeleteUnusedNodes(AAZ.getNode());
}
// We found a FSQRT, so try to make this fold:
// X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags);
AddToWorklist(Div.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags);
}
}
}
// Fold into a reciprocal estimate and multiply instead of a real divide.
if (Options.NoInfsFPMath || Flags.hasNoInfs())
if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
return RV;
}
// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost::Expensive;
TargetLowering::NegatibleCost CostN1 =
TargetLowering::NegatibleCost::Expensive;
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);
return SDValue();
}
SDValue DAGCombiner::visitFREM(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold (frem c1, c2) -> fmod(c1,c2)
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags());
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
return SDValue();
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
SDNodeFlags Flags = N->getFlags();
const TargetOptions &Options = DAG.getTarget().Options;
// Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
// sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) ||
(!Options.NoInfsFPMath && !Flags.hasNoInfs()))
return SDValue();
SDValue N0 = N->getOperand(0);
if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
// FSQRT nodes have flags that propagate to the created nodes.
return buildSqrtEstimate(N0, Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
/// copysign(x, fp_round(y)) -> copysign(x, y)
static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
SDValue N1 = N->getOperand(1);
if ((N1.getOpcode() == ISD::FP_EXTEND ||
N1.getOpcode() == ISD::FP_ROUND)) {
// Do not optimize out type conversion of f128 type yet.
// For some targets like x86_64, configuration is changed to keep one f128
// value in one SSE register, but instruction selection cannot handle
// FCOPYSIGN on SSE registers yet.
EVT N1VT = N1->getValueType(0);
EVT N1Op0VT = N1->getOperand(0).getValueType();
return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);
}
return false;
}
SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
if (N0CFP && N1CFP) // Constant fold
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
const APFloat &V = N1C->getValueAPF();
// copysign(x, c1) -> fabs(x) iff ispos(c1)
// copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
if (!V.isNegative()) {
if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
} else {
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
}
}
// copysign(fabs(x), y) -> copysign(x, y)
// copysign(fneg(x), y) -> copysign(x, y)
// copysign(copysign(x,z), y) -> copysign(x, y)
if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);
// copysign(x, abs(y)) -> abs(x)
if (N1.getOpcode() == ISD::FABS)
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
// copysign(x, copysign(y,z)) -> copysign(x, z)
if (N1.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));
// copysign(x, fp_extend(y)) -> copysign(x, y)
// copysign(x, fp_round(y)) -> copysign(x, y)
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
return SDValue();
}
SDValue DAGCombiner::visitFPOW(SDNode *N) {
ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
if (!ExponentC)
return SDValue();
// Try to convert x ** (1/3) into cube root.
// TODO: Handle the various flavors of long double.
// TODO: Since we're approximating, we don't need an exact 1/3 exponent.
// Some range near 1/3 should be fine.
EVT VT = N->getValueType(0);
if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
(VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
// pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
// pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
// pow(-val, 1/3) = nan; cbrt(-val) = -cbrt(val).
// For regular numbers, rounding may cause the results to differ.
// Therefore, we require { nsz ninf nnan afn } for this transform.
// TODO: We could select out the special cases if we don't have nsz/ninf.
SDNodeFlags Flags = N->getFlags();
if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
!Flags.hasApproximateFuncs())
return SDValue();
// Do not create a cbrt() libcall if the target does not have it, and do not
// turn a pow that has lowering support into a cbrt() libcall.
if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
(!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
return SDValue();
return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags);
}
// Try to convert x ** (1/4) and x ** (3/4) into square roots.
// x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
// TODO: This could be extended (using a target hook) to handle smaller
// power-of-2 fractional exponents.
bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
if (ExponentIs025 || ExponentIs075) {
// pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
// pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
// pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
// pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN.
// For regular numbers, rounding may cause the results to differ.
// Therefore, we require { nsz ninf afn } for this transform.
// TODO: We could select out the special cases if we don't have nsz/ninf.
SDNodeFlags Flags = N->getFlags();
// We only need no signed zeros for the 0.25 case.
if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
!Flags.hasApproximateFuncs())
return SDValue();
// Don't double the number of libcalls. We are trying to inline fast code.
if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
return SDValue();
// Assume that libcalls are the smallest code.
// TODO: This restriction should probably be lifted for vectors.
if (ForCodeSize)
return SDValue();
// pow(X, 0.25) --> sqrt(sqrt(X))
SDLoc DL(N);
SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags);
SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
if (ExponentIs025)
return SqrtSqrt;
// pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags);
}
return SDValue();
}
static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
// This optimization is guarded by a function attribute because it may produce
// unexpected results. That is, programs may be relying on the platform-specific
// undefined behavior when the float-to-int conversion overflows.
const Function &F = DAG.getMachineFunction().getFunction();
Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow");
if (StrictOverflow.getValueAsString().equals("false"))
return SDValue();
// We only do this if the target has legal ftrunc. Otherwise, we'd likely be
// replacing casts with a libcall. We also must be allowed to ignore -0.0
// because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
// conversions would return +0.0.
// FIXME: We should be able to use node-level FMF here.
// TODO: If strict math, should we use FABS (+ range check for signed cast)?
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
!DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// fptosi/fptoui round towards zero, so converting from FP to integer and
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
SDValue N0 = N->getOperand(0);
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
N0.getOperand(0).getValueType() == VT)
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
N0.getOperand(0).getValueType() == VT)
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
return SDValue();
}
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT OpVT = N0.getValueType();
// [us]itofp(undef) = 0, because the result value is bounded.
if (N0.isUndef())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
// fold (sint_to_fp c1) -> c1fp
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and SINT_TO_FP is not legal on this target,
// but UINT_TO_FP is legal on this target, try to convert.
if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
hasOperation(ISD::UINT_TO_FP, OpVT)) {
// If the sign bit is known to be zero, we can change this to UINT_TO_FP.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
}
// The next optimizations are desirable only if SELECT_CC can be lowered.
// fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
!VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
}
// fold (sint_to_fp (zext (setcc x, y, cc))) ->
// (select (setcc x, y, cc), 1.0, 0.0)
if (N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0.getOperand(0),
DAG.getConstantFP(1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
}
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
return FTrunc;
return SDValue();
}
SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT OpVT = N0.getValueType();
// [us]itofp(undef) = 0, because the result value is bounded.
if (N0.isUndef())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
// fold (uint_to_fp c1) -> c1fp
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and UINT_TO_FP is not legal on this target,
// but SINT_TO_FP is legal on this target, try to convert.
if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
hasOperation(ISD::SINT_TO_FP, OpVT)) {
// If the sign bit is known to be zero, we can change this to SINT_TO_FP.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
}
// fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
}
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
return FTrunc;
return SDValue();
}
// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
return SDValue();
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
// We can safely assume the conversion won't overflow the output range,
// because (for example) (uint8_t)18293.f is undefined behavior.
// Since we can assume the conversion won't overflow, our decision as to
// whether the input will fit in the float should depend on the minimum
// of the input range and output range.
// This means this is also safe for a signed input and unsigned output, since
// a negative input would lead to undefined behavior.
unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
unsigned ActualSize = std::min(InputSize, OutputSize);
const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
// We can only fold away the float conversion if the input range can be
// represented exactly in the float range.
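// For example, i32 -> f32 -> i32 cannot be folded (a signed i32 needs 31
// value bits but f32 carries only 24 bits of precision), whereas
// i16 -> f32 -> i16 can, since every i16 value is exactly representable.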
if (APFloat::semanticsPrecision(sem) >= ActualSize) {
if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
}
if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
return DAG.getBitcast(VT, Src);
}
return SDValue();
}
SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fp_to_sint undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (fp_to_sint c1fp) -> c1
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
return FoldIntToFPToInt(N, DAG);
}
SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fp_to_uint undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (fp_to_uint c1fp) -> c1
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
return FoldIntToFPToInt(N, DAG);
}
SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
EVT VT = N->getValueType(0);
// fold (fp_round c1fp) -> c1fp
if (N0CFP)
return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);
// fold (fp_round (fp_extend x)) -> x
if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
return N0.getOperand(0);
// fold (fp_round (fp_round x)) -> (fp_round x)
if (N0.getOpcode() == ISD::FP_ROUND) {
const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;
// Skip this folding if it results in an fp_round from f80 to f16.
//
// f80 to f16 always generates an expensive (and as yet, unimplemented)
// libcall to __truncxfhf2 instead of selecting native f16 conversion
// instructions from f32 or f64. Moreover, the first (value-preserving)
// fp_round from f80 to either f32 or f64 may become a NOP in platforms like
// x86.
if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
return SDValue();
// If the first fp_round isn't a value preserving truncation, it might
// introduce a tie in the second fp_round, that wouldn't occur in the
// single-step fp_round we want to fold to.
// In other words, double rounding isn't the same as rounding.
// Also, this is a value preserving truncation iff both fp_round's are.
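// For example, rounding f64 -> f32 -> f16 can give a different result than
// rounding f64 -> f16 directly when the intermediate f32 rounding lands
// exactly on an f16 tie.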
if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
SDLoc DL(N);
return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
}
}
// fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
N0.getOperand(0), N1);
AddToWorklist(Tmp.getNode());
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
Tmp, N0.getOperand(1));
}
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
return SDValue();
}
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// If this is fp_round(fp_extend), don't fold it; allow ourselves to be folded.
if (N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::FP_ROUND)
return SDValue();
// fold (fp_extend c1fp) -> c1fp
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
// fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
if (N0.getOpcode() == ISD::FP16_TO_FP &&
TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
// Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
// value of X.
if (N0.getOpcode() == ISD::FP_ROUND
&& N0.getConstantOperandVal(1) == 1) {
SDValue In = N0.getOperand(0);
if (In.getValueType() == VT) return In;
if (VT.bitsLT(In.getValueType()))
return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
In, N0.getOperand(1));
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
}
// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),
DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
N0.getValueType(), ExtLoad,
DAG.getIntPtrConstant(1, SDLoc(N0))),
ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
return SDValue();
}
SDValue DAGCombiner::visitFCEIL(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fceil c1) -> fceil(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
return SDValue();
}
SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ftrunc c1) -> ftrunc(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
// fold ftrunc (known rounded int x) -> x
// ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
// likely to be generated to extract integer from a rounded floating value.
switch (N0.getOpcode()) {
default: break;
case ISD::FRINT:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FFLOOR:
case ISD::FCEIL:
return N0;
}
return SDValue();
}
SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ffloor c1) -> ffloor(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
return SDValue();
}
// FIXME: FNEG and FABS have a lot in common; refactor.
SDValue DAGCombiner::visitFNEG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// Constant fold FNEG.
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
if (SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
return NegN0;
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
// FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
// know it was called from a context with a nsz flag if the input fsub does
// not.
if (N0.getOpcode() == ISD::FSUB &&
(DAG.getTarget().Options.NoSignedZerosFPMath ||
N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
N0.getOperand(0), N->getFlags());
}
// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
// constant pool values.
if (!TLI.isFNegFree(VT) &&
N0.getOpcode() == ISD::BITCAST &&
N0.getNode()->hasOneUse()) {
SDValue Int = N0.getOperand(0);
EVT IntVT = Int.getValueType();
if (IntVT.isInteger() && !IntVT.isVector()) {
APInt SignMask;
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x80... per scalar element
// and splat it.
SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x80...
SignMask = APInt::getSignMask(IntVT.getSizeInBits());
}
SDLoc DL0(N0);
Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
DAG.getConstant(SignMask, DL0, IntVT));
AddToWorklist(Int.getNode());
return DAG.getBitcast(VT, Int);
}
}
// (fneg (fmul c, x)) -> (fmul -c, x)
if (N0.getOpcode() == ISD::FMUL &&
(N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
if (CFP1) {
APFloat CVal = CFP1->getValueAPF();
CVal.changeSign();
if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) ||
TLI.isOperationLegal(ISD::ConstantFP, VT)))
return DAG.getNode(
ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
N0->getFlags());
}
}
return SDValue();
}
static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
APFloat (*Op)(const APFloat &, const APFloat &)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
if (N0CFP && N1CFP) {
const APFloat &C0 = N0CFP->getValueAPF();
const APFloat &C1 = N1CFP->getValueAPF();
return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
}
// Canonicalize to constant on RHS.
if (isConstantFPBuildVectorOrConstantFP(N0) &&
!isConstantFPBuildVectorOrConstantFP(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
return SDValue();
}
SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
return visitFMinMax(DAG, N, minnum);
}
SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
return visitFMinMax(DAG, N, maxnum);
}
SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
return visitFMinMax(DAG, N, minimum);
}
SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
return visitFMinMax(DAG, N, maximum);
}
SDValue DAGCombiner::visitFABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fabs c1) -> fabs(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
// fold (fabs (fabs x)) -> (fabs x)
if (N0.getOpcode() == ISD::FABS)
return N->getOperand(0);
// fold (fabs (fneg x)) -> (fabs x)
// fold (fabs (fcopysign x, y)) -> (fabs x)
if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
// fabs(bitcast(x)) -> bitcast(x & ~sign) to avoid constant pool loads.
if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
SDValue Int = N0.getOperand(0);
EVT IntVT = Int.getValueType();
if (IntVT.isInteger() && !IntVT.isVector()) {
APInt SignMask;
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x7f... per scalar element
// and splat it.
SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x7f...
SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
}
SDLoc DL(N0);
Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
DAG.getConstant(SignMask, DL, IntVT));
AddToWorklist(Int.getNode());
return DAG.getBitcast(N->getValueType(0), Int);
}
}
return SDValue();
}
SDValue DAGCombiner::visitBRCOND(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
// If N is a constant we could fold this into a fallthrough or unconditional
// branch. However that doesn't happen very often in normal code, because
// Instcombine/SimplifyCFG should have handled the available opportunities.
// If we did this folding here, it would be necessary to update the
// MachineBasicBlock CFG, which is awkward.
// fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
// on the target.
if (N1.getOpcode() == ISD::SETCC &&
TLI.isOperationLegalOrCustom(ISD::BR_CC,
N1.getOperand(0).getValueType())) {
return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
Chain, N1.getOperand(2),
N1.getOperand(0), N1.getOperand(1), N2);
}
if (N1.hasOneUse()) {
// rebuildSetCC calls visitXor which may change the Chain when there is a
// STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
HandleSDNode ChainHandle(Chain);
if (SDValue NewN1 = rebuildSetCC(N1))
return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
ChainHandle.getValue(), NewN1, N2);
}
return SDValue();
}
SDValue DAGCombiner::rebuildSetCC(SDValue N) {
if (N.getOpcode() == ISD::SRL ||
(N.getOpcode() == ISD::TRUNCATE &&
(N.getOperand(0).hasOneUse() &&
N.getOperand(0).getOpcode() == ISD::SRL))) {
// Look past the truncate.
if (N.getOpcode() == ISD::TRUNCATE)
N = N.getOperand(0);
// Match this pattern so that we can generate simpler code:
//
// %a = ...
// %b = and i32 %a, 2
// %c = srl i32 %b, 1
// brcond i32 %c ...
//
// into
//
// %a = ...
// %b = and i32 %a, 2
// %c = setcc eq %b, 0
// brcond %c ...
//
// This applies only when the AND constant value has one bit set and the
// SRL constant is equal to the log2 of the AND constant. The back-end is
// smart enough to convert the result into a TEST/JMP sequence.
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
SDValue AndOp1 = Op0.getOperand(1);
if (AndOp1.getOpcode() == ISD::Constant) {
const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
if (AndConst.isPowerOf2() &&
cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
SDLoc DL(N);
return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
Op0, DAG.getConstant(0, DL, Op0.getValueType()),
ISD::SETNE);
}
}
}
}
// Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne))
// Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))
if (N.getOpcode() == ISD::XOR) {
// Because we may call this on a speculatively constructed
// SimplifiedSetCC Node, we need to simplify this node first.
// Ideally this should be folded into SimplifySetCC and not
// here. For now, grab a handle to N so we don't lose it from
// replacements internal to the visit.
HandleSDNode XORHandle(N);
while (N.getOpcode() == ISD::XOR) {
SDValue Tmp = visitXOR(N.getNode());
// No simplification done.
if (!Tmp.getNode())
break;
// Returning N is a form of in-visit replacement that may invalidate
// N. Grab the value from the handle.
if (Tmp.getNode() == N.getNode())
N = XORHandle.getValue();
else // Node simplified. Try simplifying again.
N = Tmp;
}
if (N.getOpcode() != ISD::XOR)
return N;
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
// (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
Op0.getValueType() == MVT::i1) {
N = Op0;
Op0 = N->getOperand(0);
Op1 = N->getOperand(1);
Equal = true;
}
EVT SetCCVT = N.getValueType();
if (LegalTypes)
SetCCVT = getSetCCResultType(SetCCVT);
// Replace the uses of XOR with SETCC
return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
Equal ? ISD::SETEQ : ISD::SETNE);
}
}
return SDValue();
}
// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
//
SDValue DAGCombiner::visitBR_CC(SDNode *N) {
CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
// If N is a constant we could fold this into a fallthrough or unconditional
// branch. However that doesn't happen very often in normal code, because
// Instcombine/SimplifyCFG should have handled the available opportunities.
// If we did this folding here, it would be necessary to update the
// MachineBasicBlock CFG, which is awkward.
// Use SimplifySetCC to simplify SETCC's.
SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
CondLHS, CondRHS, CC->get(), SDLoc(N),
false);
if (Simp.getNode()) AddToWorklist(Simp.getNode());
// fold to a simpler setcc
if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
N->getOperand(0), Simp.getOperand(2),
Simp.getOperand(0), Simp.getOperand(1),
N->getOperand(4));
return SDValue();
}
/// Return true if 'Use' is a load or a store that uses N as its base pointer
/// and N may be folded into the load / store addressing mode.
static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
SelectionDAG &DAG,
const TargetLowering &TLI) {
EVT VT;
unsigned AS;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = LD->getMemoryVT();
AS = LD->getAddressSpace();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getMemoryVT();
AS = ST->getAddressSpace();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = LD->getMemoryVT();
AS = LD->getAddressSpace();
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getMemoryVT();
AS = ST->getAddressSpace();
} else
return false;
TargetLowering::AddrMode AM;
if (N->getOpcode() == ISD::ADD) {
AM.HasBaseReg = true;
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
// [reg +/- imm]
AM.BaseOffs = Offset->getSExtValue();
else
// [reg +/- reg]
AM.Scale = 1;
} else if (N->getOpcode() == ISD::SUB) {
AM.HasBaseReg = true;
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
// [reg +/- imm]
AM.BaseOffs = -Offset->getSExtValue();
else
// [reg +/- reg]
AM.Scale = 1;
} else
return false;
return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
VT.getTypeForEVT(*DAG.getContext()), AS);
}
static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
bool &IsLoad, bool &IsMasked, SDValue &Ptr,
const TargetLowering &TLI) {
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
if (LD->isIndexed())
return false;
EVT VT = LD->getMemoryVT();
if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
return false;
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
if (ST->isIndexed())
return false;
EVT VT = ST->getMemoryVT();
if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
return false;
Ptr = ST->getBasePtr();
IsLoad = false;
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
if (LD->isIndexed())
return false;
EVT VT = LD->getMemoryVT();
if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
!TLI.isIndexedMaskedLoadLegal(Dec, VT))
return false;
Ptr = LD->getBasePtr();
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
if (ST->isIndexed())
return false;
EVT VT = ST->getMemoryVT();
if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
!TLI.isIndexedMaskedStoreLegal(Dec, VT))
return false;
Ptr = ST->getBasePtr();
IsLoad = false;
IsMasked = true;
} else {
return false;
}
return true;
}
/// Try turning a load/store into a pre-indexed load/store when the base
/// pointer is an add or subtract and it has other uses besides the load/store.
/// After the transformation, the new indexed load/store has effectively folded
/// the add/subtract in and all of its other uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
if (Level < AfterLegalizeDAG)
return false;
bool IsLoad = true;
bool IsMasked = false;
SDValue Ptr;
if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
Ptr, TLI))
return false;
// If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
// out. There is no reason to make this a preinc/predec.
if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
Ptr.getNode()->hasOneUse())
return false;
// Ask the target to do addressing mode selection.
SDValue BasePtr;
SDValue Offset;
ISD::MemIndexedMode AM = ISD::UNINDEXED;
if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
return false;
// Backends without true r+i pre-indexed forms may need to pass a
// constant base with a variable offset so that constant coercion
// will work with the patterns in canonical form.
bool Swapped = false;
if (isa<ConstantSDNode>(BasePtr)) {
std::swap(BasePtr, Offset);
Swapped = true;
}
// Don't create an indexed load / store with zero offset.
if (isNullConstant(Offset))
return false;
// Try turning it into a pre-indexed load / store except when:
// 1) The new base ptr is a frame index.
// 2) If N is a store and the new base ptr is either the same as or is a
// predecessor of the value being stored.
// 3) Another use of old base ptr is a predecessor of N. If ptr is folded
// that would create a cycle.
// 4) All uses are load / store ops that use it as old base ptr.
// Check #1. Preinc'ing a frame index would require copying the stack pointer
// (plus the implicit offset) to a register to preinc anyway.
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
return false;
// Check #2.
if (!IsLoad) {
SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
: cast<StoreSDNode>(N)->getValue();
// Would require a copy.
if (Val == BasePtr)
return false;
// Would create a cycle.
if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
return false;
}
// Caches for hasPredecessorHelper.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(N);
// If the offset is a constant, there may be other adds of constants that
// can be folded with this one. We should do this to avoid having to keep
// a copy of the original base pointer.
SmallVector<SDNode *, 16> OtherUses;
if (isa<ConstantSDNode>(Offset))
for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
UE = BasePtr.getNode()->use_end();
UI != UE; ++UI) {
SDUse &Use = UI.getUse();
// Skip the use that is Ptr and uses of other results from BasePtr's
// node (important for nodes that return multiple results).
if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
continue;
if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
continue;
if (Use.getUser()->getOpcode() != ISD::ADD &&
Use.getUser()->getOpcode() != ISD::SUB) {
OtherUses.clear();
break;
}
SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
if (!isa<ConstantSDNode>(Op1)) {
OtherUses.clear();
break;
}
// FIXME: In some cases, we can be smarter about this.
if (Op1.getValueType() != Offset.getValueType()) {
OtherUses.clear();
break;
}
OtherUses.push_back(Use.getUser());
}
if (Swapped)
std::swap(BasePtr, Offset);
// Now check for #3 and #4.
bool RealUse = false;
for (SDNode *Use : Ptr.getNode()->uses()) {
if (Use == N)
continue;
if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
return false;
// If Ptr may be folded in the addressing mode of another use, then it's
// not profitable to do this transformation.
if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
RealUse = true;
}
if (!RealUse)
return false;
SDValue Result;
if (!IsMasked) {
if (IsLoad)
Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
else
Result =
DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
} else {
if (IsLoad)
Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM);
else
Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM);
}
++PreIndexedNodes;
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
}
// Finally, since the node is now dead, remove it from the graph.
deleteAndRecombine(N);
if (Swapped)
std::swap(BasePtr, Offset);
// Replace other uses of BasePtr that can be updated to use Ptr
for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
unsigned OffsetIdx = 1;
if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
OffsetIdx = 0;
assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
BasePtr.getNode() && "Expected BasePtr operand");
// We need to replace ptr0 in the following expression:
// x0 * offset0 + y0 * ptr0 = t0
// knowing that
// x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
//
// where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
// indexed load/store and the expression that needs to be re-written.
//
// Therefore, we have:
// t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1
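// For example, with a pre-increment result t1 = ptr0 + offset1 and another
// use t0 = ptr0 + offset0, t0 is rewritten as (offset0 - offset1) + t1.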
ConstantSDNode *CN =
cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
int X0, X1, Y0, Y1;
const APInt &Offset0 = CN->getAPIntValue();
APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
APInt CNV = Offset0;
if (X0 < 0) CNV = -CNV;
if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
else CNV = CNV - Offset1;
SDLoc DL(OtherUses[i]);
// We can now generate the new expression.
SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
SDValue NewUse = DAG.getNode(Opcode,
DL,
OtherUses[i]->getValueType(0), NewOp1, NewOp2);
DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
deleteAndRecombine(OtherUses[i]);
}
// Replace the uses of Ptr with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
deleteAndRecombine(Ptr.getNode());
AddToWorklist(Result.getNode());
return true;
}
static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
SDValue &BasePtr, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG,
const TargetLowering &TLI) {
if (PtrUse == N ||
(PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
return false;
if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
return false;
// Don't create an indexed load / store with zero offset.
if (isNullConstant(Offset))
return false;
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
return false;
SmallPtrSet<const SDNode *, 32> Visited;
for (SDNode *Use : BasePtr.getNode()->uses()) {
if (Use == Ptr.getNode())
continue;
// Don't combine if there's a later user which could perform the indexing instead.
if (isa<MemSDNode>(Use)) {
bool IsLoad = true;
bool IsMasked = false;
SDValue OtherPtr;
if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
IsMasked, OtherPtr, TLI)) {
SmallVector<const SDNode *, 2> Worklist;
Worklist.push_back(Use);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
return false;
}
}
// If all the uses are load / store addresses, then don't do the
// transformation.
if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
for (SDNode *UseUse : Use->uses())
if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
return false;
}
}
return true;
}
static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
bool &IsMasked, SDValue &Ptr,
SDValue &BasePtr, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG,
const TargetLowering &TLI) {
if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
IsMasked, Ptr, TLI) ||
Ptr.getNode()->hasOneUse())
return nullptr;
// Try turning it into a post-indexed load / store except when
// 1) All uses are load / store ops that use it as base ptr (and
// it may be folded into the addressing mode).
// 2) Op must be independent of N, i.e. Op is neither a predecessor
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
for (SDNode *Op : Ptr->uses()) {
// Check for #1.
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
continue;
// Check for #2.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 8> Worklist;
// Ptr is predecessor to both N and Op.
Visited.insert(Ptr.getNode());
Worklist.push_back(N);
Worklist.push_back(Op);
if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
!SDNode::hasPredecessorHelper(Op, Visited, Worklist))
return Op;
}
return nullptr;
}
/// Try to combine a load/store with an add/sub of the base pointer node into a
/// post-indexed load/store. The transformation effectively folds the add/subtract
/// into the new indexed load/store, and all of its uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
if (Level < AfterLegalizeDAG)
return false;
bool IsLoad = true;
bool IsMasked = false;
SDValue Ptr;
SDValue BasePtr;
SDValue Offset;
ISD::MemIndexedMode AM = ISD::UNINDEXED;
SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
Offset, AM, DAG, TLI);
if (!Op)
return false;
SDValue Result;
if (!IsMasked)
Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM)
: DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM);
else
Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM)
: DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM);
++PostIndexedNodes;
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
dbgs() << '\n');
WorklistRemover DeadNodes(*this);
if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
}
// Finally, since the node is now dead, remove it from the graph.
deleteAndRecombine(N);
// Replace the uses of Use with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
Result.getValue(IsLoad ? 1 : 0));
deleteAndRecombine(Op);
return true;
}
/// Return the base-pointer arithmetic from an indexed \p LD.
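/// For a pre/post-increment load this is BasePtr + Increment; for a
/// pre/post-decrement load it is BasePtr - Increment.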
SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
ISD::MemIndexedMode AM = LD->getAddressingMode();
assert(AM != ISD::UNINDEXED);
SDValue BP = LD->getOperand(1);
SDValue Inc = LD->getOperand(2);
// Some backends use TargetConstants for load offsets, but don't expect
// TargetConstants in general ADD nodes. We can convert these constants into
// regular Constants (if the constant is not opaque).
assert((Inc.getOpcode() != ISD::TargetConstant ||
!cast<ConstantSDNode>(Inc)->isOpaque()) &&
"Cannot split out indexing using opaque target constants");
if (Inc.getOpcode() == ISD::TargetConstant) {
ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
ConstInc->getValueType(0));
}
unsigned Opc =
(AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
}
static inline int numVectorEltsOrZero(EVT T) {
return T.isVector() ? T.getVectorNumElements() : 0;
}
bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
Val = ST->getValue();
EVT STType = Val.getValueType();
EVT STMemType = ST->getMemoryVT();
if (STType == STMemType)
return true;
if (isTypeLegal(STMemType))
return false; // fail.
if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
return true;
}
if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
STType.isInteger() && STMemType.isInteger()) {
Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
return true;
}
if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
Val = DAG.getBitcast(STMemType, Val);
return true;
}
return false; // fail.
}
bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
EVT LDMemType = LD->getMemoryVT();
EVT LDType = LD->getValueType(0);
assert(Val.getValueType() == LDMemType &&
"Attempting to extend value of non-matching type");
if (LDType == LDMemType)
return true;
if (LDMemType.isInteger() && LDType.isInteger()) {
switch (LD->getExtensionType()) {
case ISD::NON_EXTLOAD:
Val = DAG.getBitcast(LDType, Val);
return true;
case ISD::EXTLOAD:
Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
return true;
case ISD::SEXTLOAD:
Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
return true;
case ISD::ZEXTLOAD:
Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
return true;
}
}
return false;
}
SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
if (OptLevel == CodeGenOpt::None || !LD->isSimple())
return SDValue();
SDValue Chain = LD->getOperand(0);
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
// TODO: Relax this restriction for unordered atomics (see D66309)
if (!ST || !ST->isSimple())
return SDValue();
EVT LDType = LD->getValueType(0);
EVT LDMemType = LD->getMemoryVT();
EVT STMemType = ST->getMemoryVT();
EVT STType = ST->getValue().getValueType();
BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
int64_t Offset;
if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
return SDValue();
// Normalize for Endianness. After this Offset=0 will denote that the least
// significant bit in the loaded value maps to the least significant bit in
// the stored value. With Offset=n (for n > 0) the loaded value starts at the
// n-th least significant byte of the stored value.
if (DAG.getDataLayout().isBigEndian())
Offset = ((int64_t)STMemType.getStoreSizeInBits() -
(int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
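// For example, a 1-byte load at the same address as a 4-byte store on a
// big-endian target reads the most significant byte of the stored value, so
// Offset is normalized from 0 to 3.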
// Check that the stored value covers all bits that are loaded.
bool STCoversLD =
(Offset >= 0) &&
(Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
if (LD->isIndexed()) {
// Cannot handle opaque target constants and we must respect the user's
// request not to split indexes from loads.
if (!canSplitIdx(LD))
return SDValue();
SDValue Idx = SplitIndexingFromLoad(LD);
SDValue Ops[] = {Val, Idx, Chain};
return CombineTo(LD, Ops, 3);
}
return CombineTo(LD, Val, Chain);
};
if (!STCoversLD)
return SDValue();
// Memory as copy space (potentially masked).
if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
// Simple case: Direct non-truncating forwarding
if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
return ReplaceLd(LD, ST->getValue(), Chain);
// Can we model the truncate and extension with an and mask?
if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
!LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
// Mask to size of LDMemType
auto Mask =
DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
STMemType.getSizeInBits()),
SDLoc(ST), STType);
auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
return ReplaceLd(LD, Val, Chain);
}
}
// TODO: Deal with nonzero offset.
if (LD->getBasePtr().isUndef() || Offset != 0)
return SDValue();
// Model necessary truncations / extensions.
SDValue Val;
// Truncate Value To Stored Memory Size.
do {
if (!getTruncatedStoreValue(ST, Val))
continue;
if (!isTypeLegal(LDMemType))
continue;
if (STMemType != LDMemType) {
// TODO: Support vectors? This requires extract_subvector/bitcast.
if (!STMemType.isVector() && !LDMemType.isVector() &&
STMemType.isInteger() && LDMemType.isInteger())
Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
else
continue;
}
if (!extendLoadedValueToExtension(LD, Val))
continue;
return ReplaceLd(LD, Val, Chain);
} while (false);
// On failure, cleanup dead nodes we may have created.
if (Val->use_empty())
deleteAndRecombine(Val.getNode());
return SDValue();
}
SDValue DAGCombiner::visitLOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
// If load is not volatile and there are no uses of the loaded value (and
// the updated indexed value in case of indexed loads), change uses of the
// chain value into uses of the chain input (i.e. delete the dead load).
// TODO: Allow this for unordered atomics (see D66309)
if (LD->isSimple()) {
if (N->getValueType(1) == MVT::Other) {
// Unindexed loads.
if (!N->hasAnyUseOfValue(0)) {
// It's not safe to use the two value CombineTo variant here. e.g.
// v1, chain2 = load chain1, loc
// v2, chain3 = load chain2, loc
// v3 = add v2, c
// Now we replace use of chain2 with chain1. This makes the second load
// isomorphic to the one we are deleting, and thus makes this load live.
LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
dbgs() << "\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
AddUsersToWorklist(Chain.getNode());
if (N->use_empty())
deleteAndRecombine(N);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
} else {
// Indexed loads.
assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
// If this load has an opaque TargetConstant offset, then we cannot split
// the indexing into an add/sub directly (that TargetConstant may not be
// valid for a different type of node, and we cannot convert an opaque
// target constant into a regular constant).
bool CanSplitIdx = canSplitIdx(LD);
if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
SDValue Undef = DAG.getUNDEF(N->getValueType(0));
SDValue Index;
if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
Index = SplitIndexingFromLoad(LD);
// Try to fold the base pointer arithmetic into subsequent loads and
// stores.
AddUsersToWorklist(N);
} else
Index = DAG.getUNDEF(N->getValueType(1));
LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
dbgs() << " and 2 other values\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
deleteAndRecombine(N);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
}
// If this load is directly stored, replace the load value with the stored
// value.
if (auto V = ForwardStoreValueToDirectLoad(LD))
return V;
// Try to infer better alignment information than the load already has.
if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
if (*Alignment > LD->getAlign() &&
isAligned(*Alignment, LD->getSrcValueOffset())) {
SDValue NewLoad = DAG.getExtLoad(
LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
// NewLoad will always be N as we are only refining the alignment
assert(NewLoad.getNode() == N);
(void)NewLoad;
}
}
}
if (LD->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(LD, Chain);
// If there is a better chain.
if (Chain != BetterChain) {
SDValue ReplLoad;
// Replace the chain to avoid the dependency.
if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
BetterChain, Ptr, LD->getMemOperand());
} else {
ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
LD->getValueType(0),
BetterChain, Ptr, LD->getMemoryVT(),
LD->getMemOperand());
}
// Create token factor to keep old chain connected.
SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
MVT::Other, Chain, ReplLoad.getValue(1));
// Replace uses with load result and token factor
return CombineTo(N, ReplLoad.getValue(0), Token);
}
}
// Try transforming N to an indexed load.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
// Try to slice up N to more direct loads if the slices are mapped to
// different register banks or pairing can take place.
if (SliceUpLoad(N))
return SDValue(N, 0);
return SDValue();
}
namespace {
/// Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
/// Shift = srl Ty1 Origin, CstTy Amount
/// Inst = trunc Shift to Ty2
///
/// Then, it will be rewritten into:
/// Slice = load SliceTy, Base + SliceOffset
/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
///
/// SliceTy is deduced from the number of bits that are actually used to
/// build Inst.
struct LoadedSlice {
/// Helper structure used to compute the cost of a slice.
struct Cost {
/// Are we optimizing for code size.
bool ForCodeSize = false;
/// Various cost.
unsigned Loads = 0;
unsigned Truncates = 0;
unsigned CrossRegisterBanksCopies = 0;
unsigned ZExts = 0;
unsigned Shift = 0;
explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
/// Get the cost of one isolated slice.
Cost(const LoadedSlice &LS, bool ForCodeSize)
: ForCodeSize(ForCodeSize), Loads(1) {
EVT TruncType = LS.Inst->getValueType(0);
EVT LoadedType = LS.getLoadedType();
if (TruncType != LoadedType &&
!LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
ZExts = 1;
}
/// Account for slicing gain in the current cost.
/// Slicing provides a few gains, like removing a shift or a
/// truncate. This method allows growing the cost of the original
/// load by the gain from this slice.
void addSliceGain(const LoadedSlice &LS) {
// Each slice saves a truncate.
const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
LS.Inst->getValueType(0)))
++Truncates;
// If there is a shift amount, this slice gets rid of it.
if (LS.Shift)
++Shift;
// If this slice can merge a cross register bank copy, account for it.
if (LS.canMergeExpensiveCrossRegisterBankCopy())
++CrossRegisterBanksCopies;
}
Cost &operator+=(const Cost &RHS) {
Loads += RHS.Loads;
Truncates += RHS.Truncates;
CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
ZExts += RHS.ZExts;
Shift += RHS.Shift;
return *this;
}
bool operator==(const Cost &RHS) const {
return Loads == RHS.Loads && Truncates == RHS.Truncates &&
CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
ZExts == RHS.ZExts && Shift == RHS.Shift;
}
bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
bool operator<(const Cost &RHS) const {
// Assume cross register bank copies are as expensive as loads.
// FIXME: Do we want some more target hooks?
unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
// Unless we are optimizing for code size, consider the
// expensive operation first.
if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
return ExpensiveOpsLHS < ExpensiveOpsRHS;
return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
(RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
}
bool operator>(const Cost &RHS) const { return RHS < *this; }
bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
};
// The last instruction that represents the slice. This should be a
// truncate instruction.
SDNode *Inst;
// The original load instruction.
LoadSDNode *Origin;
// The right shift amount in bits from the original load.
unsigned Shift;
// The DAG from which Origin came.
// This is used to get some contextual information about legal types, etc.
SelectionDAG *DAG;
LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
unsigned Shift = 0, SelectionDAG *DAG = nullptr)
: Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
/// Get the bits used in a chunk of bits \p BitWidth large.
/// \return Result is \p BitWidth bits wide, with used bits set to 1 and
/// unused bits set to 0.
APInt getUsedBits() const {
// Reproduce the trunc(lshr) sequence:
// - Start from the truncated value.
// - Zero extend to the desired bit width.
// - Shift left.
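// For example, for trunc(lshr i32 %x, 16) to i8 this produces 0x00ff0000.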
assert(Origin && "No original load to compare against.");
unsigned BitWidth = Origin->getValueSizeInBits(0);
assert(Inst && "This slice is not bound to an instruction");
assert(Inst->getValueSizeInBits(0) <= BitWidth &&
"Extracted slice is bigger than the whole type!");
APInt UsedBits(Inst->getValueSizeInBits(0), 0);
UsedBits.setAllBits();
UsedBits = UsedBits.zext(BitWidth);
UsedBits <<= Shift;
return UsedBits;
}
/// Get the size of the slice to be loaded in bytes.
unsigned getLoadedSize() const {
unsigned SliceSize = getUsedBits().countPopulation();
assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
return SliceSize / 8;
}
/// Get the type that will be loaded for this slice.
/// Note: This may not be the final type for the slice.
EVT getLoadedType() const {
assert(DAG && "Missing context");
LLVMContext &Ctxt = *DAG->getContext();
return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
}
/// Get the alignment of the load used for this slice.
Align getAlign() const {
Align Alignment = Origin->getAlign();
uint64_t Offset = getOffsetFromBase();
if (Offset != 0)
Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
return Alignment;
}
/// Check if this slice can be rewritten with legal operations.
bool isLegal() const {
// An invalid slice is not legal.
if (!Origin || !Inst || !DAG)
return false;
// Offsets are used only by indexed loads, which we do not handle.
if (!Origin->getOffset().isUndef())
return false;
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
// Check that the type is legal.
EVT SliceType = getLoadedType();
if (!TLI.isTypeLegal(SliceType))
return false;
// Check that the load is legal for this type.
if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
return false;
// Check that the offset can be computed.
// 1. Check its type.
EVT PtrType = Origin->getBasePtr().getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
return false;
// 2. Check that it fits in the immediate.
if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
return false;
// 3. Check that the computation is legal.
if (!TLI.isOperationLegal(ISD::ADD, PtrType))
return false;
// Check that the zext is legal if it needs one.
EVT TruncateType = Inst->getValueType(0);
if (TruncateType != SliceType &&
!TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
return false;
return true;
}
/// Get the offset in bytes of this slice in the original chunk of
/// bits.
/// \pre DAG != nullptr.
uint64_t getOffsetFromBase() const {
assert(DAG && "Missing context.");
bool IsBigEndian = DAG->getDataLayout().isBigEndian();
assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
uint64_t Offset = Shift / 8;
unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
"The size of the original loaded type is not a multiple of a"
" byte.");
// If Offset is bigger than TySizeInBytes, it means we are loading all
// zeros. This should have been optimized away earlier in the process.
assert(TySizeInBytes > Offset &&
"Invalid shift amount for given loaded size");
if (IsBigEndian)
Offset = TySizeInBytes - Offset - getLoadedSize();
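// For example, on a big-endian target an i8 slice with Shift == 0 of an i32
// load ends up at byte offset 3 from the base.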
return Offset;
}
/// Generate the sequence of instructions to load the slice
/// represented by this object and redirect the uses of this slice to
/// this new sequence of instructions.
/// \pre this->Inst && this->Origin are valid Instructions and this
/// object passed the legal check: LoadedSlice::isLegal returned true.
/// \return The last instruction of the sequence used to load the slice.
SDValue loadSlice() const {
assert(Inst && Origin && "Unable to replace a non-existing slice.");
const SDValue &OldBaseAddr = Origin->getBasePtr();
SDValue BaseAddr = OldBaseAddr;
// Get the offset in that chunk of bytes w.r.t. the endianness.
int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
assert(Offset >= 0 && "Offset too big to fit in int64_t!");
if (Offset) {
// BaseAddr = BaseAddr + Offset.
EVT ArithType = BaseAddr.getValueType();
SDLoc DL(Origin);
BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
DAG->getConstant(Offset, DL, ArithType));
}
// Create the type of the loaded slice according to its size.
EVT SliceType = getLoadedType();
// Create the load for the slice.
SDValue LastInst =
DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
Origin->getMemOperand()->getFlags());
// If the final type is not the same as the loaded type, this means that
// we have to pad with zero. Create a zero extend for that.
EVT FinalType = Inst->getValueType(0);
if (SliceType != FinalType)
LastInst =
DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
return LastInst;
}
/// Check if this slice can be merged with an expensive cross register
/// bank copy. E.g.,
/// i = load i32
/// f = bitcast i32 i to float
bool canMergeExpensiveCrossRegisterBankCopy() const {
if (!Inst || !Inst->hasOneUse())
return false;
SDNode *Use = *Inst->use_begin();
if (Use->getOpcode() != ISD::BITCAST)
return false;
assert(DAG && "Missing context");
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
EVT ResVT = Use->getValueType(0);
const TargetRegisterClass *ResRC =
TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
const TargetRegisterClass *ArgRC =
TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
Use->getOperand(0)->isDivergent());
if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
return false;
// At this point, we know that we perform a cross-register-bank copy.
// Check if it is expensive.
const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
// Assume bitcasts are cheap, unless both register classes do not
// explicitly share a common sub class.
if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
return false;
// Check if it will be merged with the load.
// 1. Check the alignment constraint.
Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign(
ResVT.getTypeForEVT(*DAG->getContext()));
if (RequiredAlignment > getAlign())
return false;
// 2. Check that the load is a legal operation for that type.
if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
return false;
// 3. Check that we do not have a zext in the way.
if (Inst->getValueType(0) != getLoadedType())
return false;
return true;
}
};
} // end anonymous namespace
/// Check that all bits set in \p UsedBits form a dense region, i.e.,
/// \p UsedBits looks like 0..0 1..1 0..0.
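/// For example, 0x00ff0000 is dense, while 0x00ff00ff is not.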
static bool areUsedBitsDense(const APInt &UsedBits) {
// If all the bits are one, this is dense!
if (UsedBits.isAllOnesValue())
return true;
// Get rid of the unused bits on the right.
APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
// Get rid of the unused bits on the left.
if (NarrowedUsedBits.countLeadingZeros())
NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
// Check that the chunk of bits is completely used.
return NarrowedUsedBits.isAllOnesValue();
}
/// Check whether or not \p First and \p Second are next to each other
/// in memory. This means that there is no hole between the bits loaded
/// by \p First and the bits loaded by \p Second.
static bool areSlicesNextToEachOther(const LoadedSlice &First,
const LoadedSlice &Second) {
assert(First.Origin == Second.Origin && First.Origin &&
"Unable to match different memory origins.");
APInt UsedBits = First.getUsedBits();
assert((UsedBits & Second.getUsedBits()) == 0 &&
"Slices are not supposed to overlap.");
UsedBits |= Second.getUsedBits();
return areUsedBitsDense(UsedBits);
}
/// Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there is in the slices in \p LoadedSlices.
static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
LoadedSlice::Cost &GlobalLSCost) {
unsigned NumberOfSlices = LoadedSlices.size();
// If there are fewer than 2 elements, no pairing is possible.
if (NumberOfSlices < 2)
return;
// Sort the slices so that elements that are likely to be next to each
// other in memory are next to each other in the list.
llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
});
const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
// First (resp. Second) is the first (resp. second) potential candidate
// to be placed in a paired load.
const LoadedSlice *First = nullptr;
const LoadedSlice *Second = nullptr;
for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
// Set the beginning of the pair.
First = Second) {
Second = &LoadedSlices[CurrSlice];
// If First is NULL, it means we start a new pair.
// Get to the next slice.
if (!First)
continue;
EVT LoadedType = First->getLoadedType();
// If the types of the slices are different, we cannot pair them.
if (LoadedType != Second->getLoadedType())
continue;
// Check if the target supplies paired loads for this type.
Align RequiredAlignment;
if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
// Move to the next pair; this type is hopeless.
Second = nullptr;
continue;
}
// Check if we meet the alignment requirement.
if (First->getAlign() < RequiredAlignment)
continue;
// Check that both loads are next to each other in memory.
if (!areSlicesNextToEachOther(*First, *Second))
continue;
assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
--GlobalLSCost.Loads;
// Move to the next pair.
Second = nullptr;
}
}
/// Check the profitability of all involved LoadedSlice.
/// Currently, it is considered profitable if there are exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
///
/// Note: The order of the elements in \p LoadedSlices may be modified, but not
/// the elements themselves.
///
/// FIXME: When the cost model is mature enough, we can relax
/// constraints (1) and (2).
static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
const APInt &UsedBits, bool ForCodeSize) {
unsigned NumberOfSlices = LoadedSlices.size();
if (StressLoadSlicing)
return NumberOfSlices > 1;
// Check (1).
if (NumberOfSlices != 2)
return false;
// Check (2).
if (!areUsedBitsDense(UsedBits))
return false;
// Check (3).
LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
// The original code has one big load.
OrigCost.Loads = 1;
for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
const LoadedSlice &LS = LoadedSlices[CurrSlice];
// Accumulate the cost of all the slices.
LoadedSlice::Cost SliceCost(LS, ForCodeSize);
GlobalSlicingCost += SliceCost;
// Account as cost in the original configuration the gain obtained
// with the current slices.
OrigCost.addSliceGain(LS);
}
// If the target supports paired load, adjust the cost accordingly.
adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
return OrigCost > GlobalSlicingCost;
}
/// If the given load, \p LI, is used only by trunc or trunc(lshr)
/// operations, split it in the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
/// This slicing takes care not to insert overlapping loads.
/// \pre LI is a simple load (i.e., not an atomic or volatile load).
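///
/// Minimal sketch of the pattern (editorial illustration, assuming a
/// little-endian target; not taken from the original comments):
///   %ld = load i32 %p
///   %lo = trunc i32 %ld to i8              ; uses bits [0, 8)
///   %sh = lshr i32 %ld, 16
///   %hi = trunc i32 %sh to i8              ; uses bits [16, 24)
/// can be rewritten as two independent i8 loads from %p and %p + 2.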
bool DAGCombiner::SliceUpLoad(SDNode *N) {
if (Level < AfterLegalizeDAG)
return false;
LoadSDNode *LD = cast<LoadSDNode>(N);
if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
!LD->getValueType(0).isInteger())
return false;
// The algorithm to split up a load of a scalable vector into individual
// elements currently requires knowing the length of the loaded type,
// so will need adjusting to work on scalable vectors.
if (LD->getValueType(0).isScalableVector())
return false;
// Keep track of already used bits to detect overlapping values.
// In that case, we will just abort the transformation.
APInt UsedBits(LD->getValueSizeInBits(0), 0);
SmallVector<LoadedSlice, 4> LoadedSlices;
// Check if this load is used as several smaller chunks of bits.
// Basically, look for uses in trunc or trunc(lshr) and record a new chain
// of computation for each trunc.
for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
UI != UIEnd; ++UI) {
// Skip the uses of the chain.
if (UI.getUse().getResNo() != 0)
continue;
SDNode *User = *UI;
unsigned Shift = 0;
// Check if this is a trunc(lshr).
if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
isa<ConstantSDNode>(User->getOperand(1))) {
Shift = User->getConstantOperandVal(1);
User = *User->use_begin();
}
// At this point, User is a truncate iff we encountered trunc or
// trunc(lshr).
if (User->getOpcode() != ISD::TRUNCATE)
return false;
// The width of the type must be a power of 2 and at least 8 bits.
// Otherwise the load cannot be represented in LLVM IR.
// Moreover, if we shifted by an amount that is not a multiple of 8, the
// slice would not start on a byte boundary. We do not support that.
unsigned Width = User->getValueSizeInBits(0);
if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
return false;
// Build the slice for this chain of computations.
LoadedSlice LS(User, LD, Shift, &DAG);
APInt CurrentUsedBits = LS.getUsedBits();
// Check if this slice overlaps with another.
if ((CurrentUsedBits & UsedBits) != 0)
return false;
// Update the bits used globally.
UsedBits |= CurrentUsedBits;
// Check if the new slice would be legal.
if (!LS.isLegal())
return false;
// Record the slice.
LoadedSlices.push_back(LS);
}
// Abort slicing if it does not seem to be profitable.
if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
return false;
++SlicedLoads;
// Rewrite each chain to use an independent load.
// By construction, each chain can be represented by a unique load.
// Prepare the argument for the new token factor for all the slices.
SmallVector<SDValue, 8> ArgChains;
for (SmallVectorImpl<LoadedSlice>::const_iterator
LSIt = LoadedSlices.begin(),
LSItEnd = LoadedSlices.end();
LSIt != LSItEnd; ++LSIt) {
SDValue SliceInst = LSIt->loadSlice();
CombineTo(LSIt->Inst, SliceInst, true);
if (SliceInst.getOpcode() != ISD::LOAD)
SliceInst = SliceInst.getOperand(0);
assert(SliceInst->getOpcode() == ISD::LOAD &&
"It takes more than a zext to get to the loaded slice!!");
ArgChains.push_back(SliceInst.getValue(1));
}
SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
ArgChains);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
AddToWorklist(Chain.getNode());
return true;
}
/// Check to see if V is (and load (ptr), imm), where the load has
/// specific bytes cleared out. If so, return the byte size being masked out
/// and the shift amount.
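///
/// Worked example (editorial, not from the original comments): for
///   V = (and (load i32 Ptr), 0xFFFF00FF)
/// the inverted mask is 0x0000FF00, so NotMaskTZ = 8 and NotMaskLZ = 48
/// (adjusted to 16 for i32). That gives MaskedBytes = 1 and a byte shift of
/// NotMaskTZ / 8 = 1, i.e. exactly the second byte of the i32 is cleared,
/// and the function returns {1, 1}.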
static std::pair<unsigned, unsigned>
CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
std::pair<unsigned, unsigned> Result(0, 0);
// Check for the structure we're looking for.
if (V->getOpcode() != ISD::AND ||
!isa<ConstantSDNode>(V->getOperand(1)) ||
!ISD::isNormalLoad(V->getOperand(0).getNode()))
return Result;
// Check the chain and pointer.
LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
// This only handles simple types.
if (V.getValueType() != MVT::i16 &&
V.getValueType() != MVT::i32 &&
V.getValueType() != MVT::i64)
return Result;
// Check the constant mask. Invert it so that the bits being cleared by the
// mask become 1 and the bits being kept become 0. Use getSExtValue so that
// leading bits follow the sign bit for uniformity.
uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
unsigned NotMaskLZ = countLeadingZeros(NotMask);
if (NotMaskLZ & 7) return Result; // Must be multiple of a byte.
unsigned NotMaskTZ = countTrailingZeros(NotMask);
if (NotMaskTZ & 7) return Result; // Must be multiple of a byte.
if (NotMaskLZ == 64) return Result; // All zero mask.
// See if we have a continuous run of bits. If so, we have 0*1+0*
if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
return Result;
// Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
if (V.getValueType() != MVT::i64 && NotMaskLZ)
NotMaskLZ -= 64-V.getValueSizeInBits();
unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
switch (MaskedBytes) {
case 1:
case 2:
case 4: break;
default: return Result; // All one mask, or 5-byte mask.
}
// Verify that the masked region starts at a byte offset that is a multiple
// of its size so that the access is aligned the same as the access width.
if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
// For narrowing to be valid, the load must be the memory operation
// immediately preceding the store.
if (LD == Chain.getNode())
; // ok.
else if (Chain->getOpcode() == ISD::TokenFactor &&
SDValue(LD, 1).hasOneUse()) {
// LD has only 1 chain use so there are no indirect dependencies.
if (!LD->isOperandOf(Chain.getNode()))
return Result;
} else
return Result; // Fail.
Result.first = MaskedBytes;
Result.second = NotMaskTZ/8;
return Result;
}
/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
SDValue IVal, StoreSDNode *St,
DAGCombiner *DC) {
unsigned NumBytes = MaskInfo.first;
unsigned ByteShift = MaskInfo.second;
SelectionDAG &DAG = DC->getDAG();
// Check to see if IVal is all zeros in the part being masked in by the 'or'
// that uses this. If not, this is not a replacement.
APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
ByteShift*8, (ByteShift+NumBytes)*8);
if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();
// Check that it is legal on the target to do this. It is legal if the new
// VT we're shrinking to (i8/i16/i32) is legal or we're still before type
// legalization (and the target doesn't explicitly think this is a bad idea).
MVT VT = MVT::getIntegerVT(NumBytes * 8);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DC->isTypeLegal(VT))
return SDValue();
if (St->getMemOperand() &&
!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*St->getMemOperand()))
return SDValue();
// Okay, we can do this! Replace the 'St' store with a store of IVal that is
// shifted by ByteShift and truncated down to NumBytes.
if (ByteShift) {
SDLoc DL(IVal);
IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
DAG.getConstant(ByteShift*8, DL,
DC->getShiftAmountTy(IVal.getValueType())));
}
// Figure out the offset for the store and the alignment of the access.
unsigned StOffset;
unsigned NewAlign = St->getAlignment();
if (DAG.getDataLayout().isLittleEndian())
StOffset = ByteShift;
else
StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
SDValue Ptr = St->getBasePtr();
if (StOffset) {
SDLoc DL(IVal);
Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL);
NewAlign = MinAlign(NewAlign, StOffset);
}
// Truncate down to the new size.
IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
++OpsNarrowed;
return DAG
.getStore(St->getChain(), SDLoc(St), IVal, Ptr,
St->getPointerInfo().getWithOffset(StOffset), NewAlign);
}
/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
/// or code size.
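///
/// Illustrative example (editorial sketch, assuming little-endian byte
/// offsets; not from the original comments): for
///   X = (and (load i32 Ptr), 0xFFFF00FF)   ; byte 1 of the load is cleared
///   store (or X, Y), Ptr
/// if MaskedValueIsZero proves Y only provides byte 1, the sequence can be
/// replaced by a single i8 store of (trunc (srl Y, 8)) at Ptr + 1, which
/// makes both the load and the 'or' dead.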
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
if (!ST->isSimple())
return SDValue();
SDValue Chain = ST->getChain();
SDValue Value = ST->getValue();
SDValue Ptr = ST->getBasePtr();
EVT VT = Value.getValueType();
if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
return SDValue();
unsigned Opc = Value.getOpcode();
// If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
// is a byte mask indicating a consecutive number of bytes, check to see if
// Y is known to provide just those bytes. If so, we try to replace the
// load + replace + store sequence with a single (narrower) store, which makes
// the load dead.
if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
std::pair<unsigned, unsigned> MaskedLoad;
MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
if (MaskedLoad.first)
if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
Value.getOperand(1), ST,this))
return NewST;
// Or is commutative, so try swapping X and Y.
MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
if (MaskedLoad.first)
if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
Value.getOperand(0), ST,this))
return NewST;
}
if (!EnableReduceLoadOpStoreWidth)
return SDValue();
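// Illustrative example of the narrowing below (editorial note, assuming a
// little-endian target; not from the original comments): for
//   x = load i32 p
//   store (or x, 0x00FF0000), p
// only byte 2 is touched, so ShAmt = 16, NewBW = 8 and the sequence can
// become an i8 load/or/store at p + 2 with the immediate 0xFF, provided the
// i8 operation and the narrower memory access are legal and profitable.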
if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
Value.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N0 = Value.getOperand(0);
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
Chain == SDValue(N0.getNode(), 1)) {
LoadSDNode *LD = cast<LoadSDNode>(N0);
if (LD->getBasePtr() != Ptr ||
LD->getPointerInfo().getAddrSpace() !=
ST->getPointerInfo().getAddrSpace())
return SDValue();
// Find the type to narrow the load / op / store to.
SDValue N1 = Value.getOperand(1);
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
if (Opc == ISD::AND)
Imm ^= APInt::getAllOnesValue(BitWidth);
if (Imm == 0 || Imm.isAllOnesValue())
return SDValue();
unsigned ShAmt = Imm.countTrailingZeros();
unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
unsigned NewBW = NextPowerOf2(MSB - ShAmt);
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
while (NewBW < BitWidth &&
(NewVT.getStoreSizeInBits() != NewBW ||
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
!TLI.isNarrowingProfitable(VT, NewVT))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
if (NewBW >= BitWidth)
return SDValue();
// If the lowest changed bit does not start at a NewBW-bit boundary,
// start the narrowed access at the previous NewBW boundary.
if (ShAmt % NewBW)
ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
std::min(BitWidth, ShAmt + NewBW));
if ((Imm & Mask) == Imm) {
APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
if (Opc == ISD::AND)
NewImm ^= APInt::getAllOnesValue(NewBW);
uint64_t PtrOff = ShAmt / 8;
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (DAG.getDataLayout().isBigEndian())
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))
return SDValue();
SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD));
SDValue NewLD =
DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
DAG.getConstant(NewImm, SDLoc(Value),
NewVT));
SDValue NewST =
DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
AddToWorklist(NewPtr.getNode());
AddToWorklist(NewLD.getNode());
AddToWorklist(NewVal.getNode());
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
++OpsNarrowed;
return NewST;
}
}
return SDValue();
}
/// For a given floating point load / store pair, if the load value isn't used
/// by any other operations, then consider transforming the pair to integer
/// load / store operations if the target deems the transformation profitable.
SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Value = ST->getValue();
if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
Value.hasOneUse()) {
LoadSDNode *LD = cast<LoadSDNode>(Value);
EVT VT = LD->getMemoryVT();
if (!VT.isFloatingPoint() ||
VT != ST->getMemoryVT() ||
LD->isNonTemporal() ||
ST->isNonTemporal() ||
LD->getPointerInfo().getAddrSpace() != 0 ||
ST->getPointerInfo().getAddrSpace() != 0)
return SDValue();
TypeSize VTSize = VT.getSizeInBits();
// We don't know the size of scalable types at compile time so we cannot
// create an integer of the equivalent size.
if (VTSize.isScalable())
return SDValue();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
!TLI.isOperationLegal(ISD::STORE, IntVT) ||
!TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
!TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
return SDValue();
Align LDAlign = LD->getAlign();
Align STAlign = ST->getAlign();
Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy);
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
SDValue NewLD =
DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
LD->getPointerInfo(), LDAlign);
SDValue NewST =
DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
ST->getPointerInfo(), STAlign);
AddToWorklist(NewLD.getNode());
AddToWorklist(NewST.getNode());
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
++LdStFP2Int;
return NewST;
}
return SDValue();
}
// This is a helper function for visitMUL to check the profitability
// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
// MulNode is the original multiply, AddNode is (add x, c1),
// and ConstNode is c2.
//
// If the (add x, c1) has multiple uses, we could increase
// the number of adds if we make this transformation.
// It would only be worth doing this if we can remove a
// multiply in the process. Check for that here.
// To illustrate:
// (A + c1) * c3
// (A + c2) * c3
// We're checking for cases where we have common "c3 * A" expressions.
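//
// Concrete illustration (editorial, not from the original comment): with
// c1 = 3, c2 = 7 and c3 = 5, the two users (A + 3) * 5 and (A + 7) * 5
// expand to (A * 5) + 15 and (A * 5) + 35, sharing the single multiply
// A * 5, so the transformation is reported as profitable.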
bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
SDValue &AddNode,
SDValue &ConstNode) {
APInt Val;
// If the add only has one use, this would be OK to do.
if (AddNode.getNode()->hasOneUse())
return true;
// Walk all the users of the constant with which we're multiplying.
for (SDNode *Use : ConstNode->uses()) {
if (Use == MulNode) // This use is the one we're on right now. Skip it.
continue;
if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
SDNode *OtherOp;
SDNode *MulVar = AddNode.getOperand(0).getNode();
// OtherOp is what we're multiplying against the constant.
if (Use->getOperand(0) == ConstNode)
OtherOp = Use->getOperand(1).getNode();
else
OtherOp = Use->getOperand(0).getNode();
// Check to see if multiply is with the same operand of our "add".
//
// ConstNode = CONST
// Use = ConstNode * A <-- visiting Use. OtherOp is A.
// ...
// AddNode = (A + c1) <-- MulVar is A.
// = AddNode * ConstNode <-- current visiting instruction.
//
// If we make this transformation, we will have a common
// multiply (ConstNode * A) that we can save.
if (OtherOp == MulVar)
return true;
// Now check to see if a future expansion will give us a common
// multiply.
//
// ConstNode = CONST
// AddNode = (A + c1)
// ... = AddNode * ConstNode <-- current visiting instruction.
// ...
// OtherOp = (A + c2)
// Use = OtherOp * ConstNode <-- visiting Use.
//
// If we make this transformation, we will have a common
// multiply (CONST * A) after we also do the same transformation
// to the "t2" instruction.
if (OtherOp->getOpcode() == ISD::ADD &&
DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
OtherOp->getOperand(0).getNode() == MulVar)
return true;
}
}
// Didn't find a case where this would be profitable.
return false;
}
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores) {
SmallVector<SDValue, 8> Chains;
SmallPtrSet<const SDNode *, 8> Visited;
SDLoc StoreDL(StoreNodes[0].MemNode);
for (unsigned i = 0; i < NumStores; ++i) {
Visited.insert(StoreNodes[i].MemNode);
}
// Don't include nodes that are children or repeated nodes.
for (unsigned i = 0; i < NumStores; ++i) {
if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
Chains.push_back(StoreNodes[i].MemNode->getChain());
}
assert(Chains.size() > 0 && "Chain should have generated a chain");
return DAG.getTokenFactor(StoreDL, Chains);
}
bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector, bool UseTrunc) {
// Make sure we have something to merge.
if (NumStores < 2)
return false;
// The latest Node in the DAG.
SDLoc DL(StoreNodes[0].MemNode);
TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
unsigned SizeInBits = NumStores * ElementSizeBits;
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
EVT StoreTy;
if (UseVector) {
unsigned Elts = NumStores * NumMemElts;
// Get the type for the merged vector store.
StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
} else
StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
SDValue StoredVal;
if (UseVector) {
if (IsConstantSrc) {
SmallVector<SDValue, 8> BuildVector;
for (unsigned I = 0; I != NumStores; ++I) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
SDValue Val = St->getValue();
// If constant is of the wrong type, convert it now.
if (MemVT != Val.getValueType()) {
Val = peekThroughBitcasts(Val);
// Deal with constants of wrong size.
if (ElementSizeBits != Val.getValueSizeInBits()) {
EVT IntMemVT =
EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
if (isa<ConstantFPSDNode>(Val)) {
// Not clear how to truncate FP values.
return false;
} else if (auto *C = dyn_cast<ConstantSDNode>(Val))
Val = DAG.getConstant(C->getAPIntValue()
.zextOrTrunc(Val.getValueSizeInBits())
.zextOrTrunc(ElementSizeBits),
SDLoc(C), IntMemVT);
}
// Make sure the correctly-sized value is bitcast to the correct type.
Val = DAG.getBitcast(MemVT, Val);
}
BuildVector.push_back(Val);
}
StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
: ISD::BUILD_VECTOR,
DL, StoreTy, BuildVector);
} else {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue Val = peekThroughBitcasts(St->getValue());
// All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
// type MemVT. If the underlying value is not the correct
// type, but it is an extraction of an appropriate vector we
// can recast Val to be of the correct type. This may require
// converting between EXTRACT_VECTOR_ELT and
// EXTRACT_SUBVECTOR.
if ((MemVT != Val.getValueType()) &&
(Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
EVT MemVTScalarTy = MemVT.getScalarType();
// We may need to add a bitcast here to get types to line up.
if (MemVTScalarTy != Val.getValueType().getScalarType()) {
Val = DAG.getBitcast(MemVT, Val);
} else {
unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
: ISD::EXTRACT_VECTOR_ELT;
SDValue Vec = Val.getOperand(0);
SDValue Idx = Val.getOperand(1);
Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
}
}
Ops.push_back(Val);
}
// Build the extracted vector elements back into a vector.
StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
: ISD::BUILD_VECTOR,
DL, StoreTy, Ops);
}
} else {
// We should always use a vector store when merging extracted vector
// elements, so this path implies a store of constants.
assert(IsConstantSrc && "Merged vector elements should use vector store");
APInt StoreInt(SizeInBits, 0);
// Construct a single integer constant which is made of the smaller
// constant inputs.
bool IsLE = DAG.getDataLayout().isLittleEndian();
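// Illustrative note (editorial, not from the original comments): merging
// four i8 stores of 0x11, 0x22, 0x33, 0x44 at increasing addresses on a
// little-endian target visits them in reverse and builds
// StoreInt = 0x44332211, so the byte at the highest address ends up in the
// most significant bits of the merged integer.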
for (unsigned i = 0; i < NumStores; ++i) {
unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
SDValue Val = St->getValue();
Val = peekThroughBitcasts(Val);
StoreInt <<= ElementSizeBits;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
StoreInt |= C->getAPIntValue()
.zextOrTrunc(ElementSizeBits)
.zextOrTrunc(SizeInBits);
} else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
StoreInt |= C->getValueAPF()
.bitcastToAPInt()
.zextOrTrunc(ElementSizeBits)
.zextOrTrunc(SizeInBits);
// If fp truncation is necessary give up for now.
if (MemVT.getSizeInBits() != ElementSizeBits)
return false;
} else {
llvm_unreachable("Invalid constant element type");
}
}
// Create the new Load and Store operations.
StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
}
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
// Make sure we use a trunc store if it's necessary to be legal.
SDValue NewStore;
if (!UseTrunc) {
NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(),
FirstInChain->getAlignment());
} else { // Must be realized as a trunc store
EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
SDValue ExtendedStoreVal =
DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
LegalizedStoredValTy);
NewStore = DAG.getTruncStore(
NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
FirstInChain->getAlignment(),
FirstInChain->getMemOperand()->getFlags());
}
// Replace all merged stores with the new store.
for (unsigned i = 0; i < NumStores; ++i)
CombineTo(StoreNodes[i].MemNode, NewStore);
AddToWorklist(NewChain.getNode());
return true;
}
void DAGCombiner::getStoreMergeCandidates(
StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
SDNode *&RootNode) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
EVT MemVT = St->getMemoryVT();
SDValue Val = peekThroughBitcasts(St->getValue());
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return;
// Do not handle stores to undef base pointers.
if (BasePtr.getBase().isUndef())
return;
StoreSource StoreSrc = getStoreSource(Val);
assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");
BaseIndexOffset LBasePtr;
// Match on loadbaseptr if relevant.
EVT LoadVT;
if (StoreSrc == StoreSource::Load) {
auto *Ld = cast<LoadSDNode>(Val);
LBasePtr = BaseIndexOffset::match(Ld, DAG);
LoadVT = Ld->getMemoryVT();
// Load and store should be the same type.
if (MemVT != LoadVT)
return;
// Loads must only have one use.
if (!Ld->hasNUsesOfValue(1, 0))
return;
// The memory operands must not be volatile/indexed/atomic.
// TODO: May be able to relax for unordered atomics (see D66309)
if (!Ld->isSimple() || Ld->isIndexed())
return;
}
auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
int64_t &Offset) -> bool {
// The memory operands must not be volatile/indexed/atomic.
// TODO: May be able to relax for unordered atomics (see D66309)
if (!Other->isSimple() || Other->isIndexed())
return false;
// Don't mix temporal stores with non-temporal stores.
if (St->isNonTemporal() != Other->isNonTemporal())
return false;
SDValue OtherBC = peekThroughBitcasts(Other->getValue());
// Allow merging constants of different types as integers.
bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
: Other->getMemoryVT() != MemVT;
if (StoreSrc == StoreSource::Load) {
if (NoTypeMatch)
return false;
// The Load's Base Ptr must also match
if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) {
BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
if (LoadVT != OtherLd->getMemoryVT())
return false;
// Loads must only have one use.
if (!OtherLd->hasNUsesOfValue(1, 0))
return false;
// The memory operands must not be volatile/indexed/atomic.
// TODO: May be able to relax for unordered atomics (see D66309)
if (!OtherLd->isSimple() ||
OtherLd->isIndexed())
return false;
// Don't mix temporal loads with non-temporal loads.
if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
} else
return false;
}
if (StoreSrc == StoreSource::Constant) {
if (NoTypeMatch)
return false;
if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))
return false;
}
if (StoreSrc == StoreSource::Extract) {
// Do not merge truncated stores here.
if (Other->isTruncatingStore())
return false;
if (!MemVT.bitsEq(OtherBC.getValueType()))
return false;
if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
}
Ptr = BaseIndexOffset::match(Other, DAG);
return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
};
// Check if this pair of StoreNode and RootNode has already bailed out of
// the dependence check more times than the limit allows.
auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
SDNode *RootNode) -> bool {
auto RootCount = StoreRootCountMap.find(StoreNode);
if (RootCount != StoreRootCountMap.end() &&
RootCount->second.first == RootNode &&
RootCount->second.second > StoreMergeDependenceLimit)
return true;
return false;
};
// We are looking for a root node which is an ancestor to all mergeable
// stores. We search up through a load, to our root and then down
// through all children. For instance we will find Store{1,2,3} if
// St is Store1, Store2, or Store3 where the root is not a load,
// which is always true for nonvolatile ops. TODO: Expand
// the search to find all valid candidates through multiple layers of loads.
//
// Root
// |-------|-------|
// Load Load Store3
// | |
// Store1 Store2
//
// FIXME: We should be able to climb and
// descend TokenFactors to find candidates as well.
RootNode = St->getChain().getNode();
unsigned NumNodesExplored = 0;
if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
RootNode = Ldn->getChain().getNode();
for (auto I = RootNode->use_begin(), E = RootNode->use_end();
I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
if (I2.getOperandNo() == 0)
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
BaseIndexOffset Ptr;
int64_t PtrDiff;
if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
!OverLimitInDependenceCheck(OtherST, RootNode))
StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
} else
for (auto I = RootNode->use_begin(), E = RootNode->use_end();
I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
if (I.getOperandNo() == 0)
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
BaseIndexOffset Ptr;
int64_t PtrDiff;
if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
!OverLimitInDependenceCheck(OtherST, RootNode))
StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
}
// We need to check that merging these stores does not cause a loop in
// the DAG. Any store candidate may depend on another candidate
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
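//
// Illustrative cycle (editorial sketch, not from the original comments): if
// candidate store S1 writes memory that a load L later reads, and L's value
// is what candidate store S2 stores, then merging S1 and S2 into one node
// would make that node both a predecessor and a successor of L, i.e. a
// cycle in the DAG.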
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
SDNode *RootNode) {
// FIXME: We should be able to truncate a full search of
// predecessors by doing a BFS and keeping tabs on the originating
// stores from which worklist nodes come, in a similar way to
// TokenFactor simplification.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 8> Worklist;
// RootNode is a predecessor to all candidates so we need not search
// past it. Add RootNode (peeking through TokenFactors). Do not count
// these towards size check.
Worklist.push_back(RootNode);
while (!Worklist.empty()) {
auto N = Worklist.pop_back_val();
if (!Visited.insert(N).second)
continue; // Already present in Visited.
if (N->getOpcode() == ISD::TokenFactor) {
for (SDValue Op : N->ops())
Worklist.push_back(Op.getNode());
}
}
// Don't count pruning nodes towards max.
unsigned int Max = 1024 + Visited.size();
// Search Ops of store candidates.
for (unsigned i = 0; i < NumStores; ++i) {
SDNode *N = StoreNodes[i].MemNode;
// Of the 4 Store Operands:
// * Chain (Op 0) -> We have already considered these
// in candidate selection and can be
// safely ignored
// * Value (Op 1) -> Cycles may happen (e.g. through load chains)
// * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
// but aren't necessarily from the same base node, so
// cycles are possible (e.g. via indexed store).
// * (Op 3) -> Represents the pre or post-indexing offset (or undef for
// non-indexed stores). Not constant on all targets (e.g. ARM)
// and so can participate in a cycle.
for (unsigned j = 1; j < N->getNumOperands(); ++j)
Worklist.push_back(N->getOperand(j).getNode());
}
// Search through DAG. We can stop early if we find a store node.
for (unsigned i = 0; i < NumStores; ++i)
if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
Max)) {
// If the search bails out, record the StoreNode and RootNode in the
// StoreRootCountMap. If we have seen the pair more times than the limit,
// we won't add the StoreNode to the StoreNodes set again.
if (Visited.size() >= Max) {
auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
if (RootCount.first == RootNode)
RootCount.second++;
else
RootCount = {RootNode, 1};
}
return false;
}
return true;
}
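// Worked example (editorial note, not from the original source): with
// 4-byte elements and sorted offsets {0, 2, 4, 8, 12}, the scan below skips
// the overlapping entries at 0 and 2 (0 + 4 != 2 and 2 + 4 != 4), trims
// them, and then finds the consecutive run {4, 8, 12}, returning 3.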
unsigned
DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
int64_t ElementSizeBytes) const {
while (true) {
// Find a store past the width of the first store.
size_t StartIdx = 0;
while ((StartIdx + 1 < StoreNodes.size()) &&
StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
StoreNodes[StartIdx + 1].OffsetFromBase)
++StartIdx;
// Bail if we don't have enough candidates to merge.
if (StartIdx + 1 >= StoreNodes.size())
return 0;
// Trim stores that overlapped with the first store.
if (StartIdx)
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
// Scan the memory operations on the chain and find the first
// non-consecutive store memory address.
unsigned NumConsecutiveStores = 1;
int64_t StartAddress = StoreNodes[0].OffsetFromBase;
// Check that the addresses are consecutive starting from the second
// element in the list of stores.
for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
if (CurrAddress - StartAddress != (ElementSizeBytes * i))
break;
NumConsecutiveStores = i + 1;
}
if (NumConsecutiveStores > 1)
return NumConsecutiveStores;
// There are no consecutive stores at the start of the list.
// Remove the first store and try again.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
}
}
bool DAGCombiner::tryStoreMergeOfConstants(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
EVT MemVT, SDNode *RootNode, bool AllowVectors) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
int64_t ElementSizeBytes = MemVT.getStoreSize();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
// Store the constants into memory as one consecutive store.
while (NumConsecutiveStores >= 2) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
unsigned LastLegalType = 1;
unsigned LastLegalVectorType = 1;
bool LastIntegerTrunc = false;
bool NonZero = false;
unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue StoredVal = ST->getValue();
bool IsElementZero = false;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
IsElementZero = C->isNullValue();
else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
IsElementZero = C->getConstantFPValue()->isNullValue();
if (IsElementZero) {
if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
FirstZeroAfterNonZero = i;
}
NonZero |= !IsElementZero;
// Find a legal type for the constant store.
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
bool IsFast = false;
// Break early when size is too large to be legal.
if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
break;
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
LastIntegerTrunc = false;
LastLegalType = i + 1;
// Or check whether a truncstore is legal.
} else if (TLI.getTypeAction(Context, StoreTy) ==
TargetLowering::TypePromoteInteger) {
EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
LastIntegerTrunc = true;
LastLegalType = i + 1;
}
}
// We only use vectors if the constant is known to be zero or the
// target allows it and the function is not marked with the
// noimplicitfloat attribute.
if ((!NonZero ||
TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
AllowVectors) {
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast)
LastLegalVectorType = i + 1;
}
}
bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
// Check if we found a legal integer type that creates a meaningful
// merge.
if (NumElem < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning, one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have, is if the alignment has
// improved or we've dropped a non-zero value. Drop as many
// candidates as we can here.
unsigned NumSkip = 1;
while ((NumSkip < NumConsecutiveStores) &&
(NumSkip < FirstZeroAfterNonZero) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
NumSkip++;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
continue;
}
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
RootNode)) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
continue;
}
MadeChange |= mergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
// Remove merged stores for next iteration.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
}
return MadeChange;
}
bool DAGCombiner::tryStoreMergeOfExtracts(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
EVT MemVT, SDNode *RootNode) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
// Loop on Consecutive Stores on success.
while (NumConsecutiveStores >= 2) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
unsigned NumStoresToMerge = 1;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
bool IsFast = false;
// Break early when size is too large to be legal.
if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
break;
if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast)
NumStoresToMerge = i + 1;
}
// Check if we found a legal integer type creating a meaningful
// merge.
if (NumStoresToMerge < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning, one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have, is if the alignment has
// improved. Drop as many candidates as we can here.
unsigned NumSkip = 1;
while ((NumSkip < NumConsecutiveStores) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
NumSkip++;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
continue;
}
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
RootNode)) {
StoreNodes.erase(StoreNodes.begin(),
StoreNodes.begin() + NumStoresToMerge);
NumConsecutiveStores -= NumStoresToMerge;
continue;
}
MadeChange |= mergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumStoresToMerge, false, true, false);
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
NumConsecutiveStores -= NumStoresToMerge;
}
return MadeChange;
}
bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *RootNode, bool AllowVectors,
bool IsNonTemporalStore,
bool IsNonTemporalLoad) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
int64_t ElementSizeBytes = MemVT.getStoreSize();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
int64_t StartAddress = StoreNodes[0].OffsetFromBase;
// Look for load nodes which are used by the stored values.
SmallVector<MemOpLink, 8> LoadNodes;
// Find acceptable loads. Loads need to have the same chain (token factor),
// must not be zext, volatile, indexed, and they must be consecutive.
BaseIndexOffset LdBasePtr;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue Val = peekThroughBitcasts(St->getValue());
LoadSDNode *Ld = cast<LoadSDNode>(Val);
BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
// If this is not the first ptr that we check.
int64_t LdOffset = 0;
if (LdBasePtr.getBase().getNode()) {
// The base ptr must be the same.
if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
break;
} else {
// Check that all other base pointers are the same as this one.
LdBasePtr = LdPtr;
}
// We found a potential memory operand to merge.
LoadNodes.push_back(MemOpLink(Ld, LdOffset));
}
while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
Align RequiredAlignment;
bool NeedRotate = false;
if (LoadNodes.size() == 2) {
// If we have load/store pair instructions and we only have two values,
// don't bother merging.
if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
break;
}
// If the loads are reversed, see if we can rotate the halves into place.
int64_t Offset0 = LoadNodes[0].OffsetFromBase;
int64_t Offset1 = LoadNodes[1].OffsetFromBase;
EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
if (Offset0 - Offset1 == ElementSizeBytes &&
(hasOperation(ISD::ROTL, PairVT) ||
hasOperation(ISD::ROTR, PairVT))) {
std::swap(LoadNodes[0], LoadNodes[1]);
NeedRotate = true;
}
}
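// Rotate illustration (editorial note, not from the original comments): when
// the two loads are in reversed order relative to the two stores, the pair
// can still be merged by loading the 2 * ElementSizeBytes integer once and
// rotating it by half its width before the single merged store.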
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
// Scan the memory operations on the chain and find the first
// non-consecutive load memory address. These variables hold the index in
// the store node array.
unsigned LastConsecutiveLoad = 1;
// This variable refers to the size and not index in the array.
unsigned LastLegalVectorType = 1;
unsigned LastLegalIntegerType = 1;
bool isDereferenceable = true;
bool DoIntegerTruncate = false;
StartAddress = LoadNodes[0].OffsetFromBase;
SDValue LoadChain = FirstLoad->getChain();
for (unsigned i = 1; i < LoadNodes.size(); ++i) {
// All loads must share the same chain.
if (LoadNodes[i].MemNode->getChain() != LoadChain)
break;
int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
if (CurrAddress - StartAddress != (ElementSizeBytes * i))
break;
LastConsecutiveLoad = i;
if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
isDereferenceable = false;
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
// Break early when size is too large to be legal.
if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
break;
bool IsFastSt = false;
bool IsFastLd = false;
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalVectorType = i + 1;
}
// Find a legal type for the integer store.
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
StoreTy = EVT::getIntegerVT(Context, SizeInBits);
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
DoIntegerTruncate = false;
// Or check whether a truncstore and extload is legal.
} else if (TLI.getTypeAction(Context, StoreTy) ==
TargetLowering::TypePromoteInteger) {
EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
DoIntegerTruncate = true;
}
}
}
// Only use vector types if the vector type is larger than the integer
// type. If they are the same, use integers.
bool UseVectorTy =
LastLegalVectorType > LastLegalIntegerType && AllowVectors;
unsigned LastLegalType =
std::max(LastLegalVectorType, LastLegalIntegerType);
// We add +1 here because the LastXXX variables refer to a location (index)
// while NumElem refers to a size (element count).
unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
NumElem = std::min(LastLegalType, NumElem);
unsigned FirstLoadAlign = FirstLoad->getAlignment();
if (NumElem < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning, one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have is if the alignment or either
// the load or store has improved. Drop as many candidates as we
// can here.
unsigned NumSkip = 1;
while ((NumSkip < LoadNodes.size()) &&
(LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
NumSkip++;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
continue;
}
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
RootNode)) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
continue;
}
// Find if it is better to use vectors or integers to load and store
// to memory.
EVT JointMemOpVT;
if (UseVectorTy) {
// Find a legal type for the vector store.
unsigned Elts = NumElem * NumMemElts;
JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
} else {
unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
}
SDLoc LoadDL(LoadNodes[0].MemNode);
SDLoc StoreDL(StoreNodes[0].MemNode);
// The merged loads are required to have the same incoming chain, so
// using the first's chain is acceptable.
SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
AddToWorklist(NewStoreChain.getNode());
MachineMemOperand::Flags LdMMOFlags =
isDereferenceable ? MachineMemOperand::MODereferenceable
: MachineMemOperand::MONone;
if (IsNonTemporalLoad)
LdMMOFlags |= MachineMemOperand::MONonTemporal;
MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
? MachineMemOperand::MONonTemporal
: MachineMemOperand::MONone;
SDValue NewLoad, NewStore;
if (UseVectorTy || !DoIntegerTruncate) {
NewLoad = DAG.getLoad(
JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
SDValue StoreOp = NewLoad;
if (NeedRotate) {
unsigned LoadWidth = ElementSizeBytes * 8 * 2;
assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
"Unexpected type for rotate-able load pair");
SDValue RotAmt =
DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
// Target can convert to the identical ROTR if it does not have ROTL.
StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
}
NewStore = DAG.getStore(
NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
} else { // This must be the truncstore/extload case
EVT ExtendedTy =
TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
FirstLoad->getChain(), FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), JointMemOpVT,
FirstLoadAlign, LdMMOFlags);
NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), JointMemOpVT,
FirstInChain->getAlignment(),
FirstInChain->getMemOperand()->getFlags());
}
// Transfer chain users from old loads to the new load.
for (unsigned i = 0; i < NumElem; ++i) {
LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
SDValue(NewLoad.getNode(), 1));
}
// Replace all stores with the new store. Recursively remove corresponding
// values if they are no longer used.
for (unsigned i = 0; i < NumElem; ++i) {
SDValue Val = StoreNodes[i].MemNode->getOperand(1);
CombineTo(StoreNodes[i].MemNode, NewStore);
if (Val.getNode()->use_empty())
recursivelyDeleteUnusedNodes(Val.getNode());
}
MadeChange = true;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
}
return MadeChange;
}
bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
return false;
// TODO: Extend this function to merge stores of scalable vectors.
// (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
// store since we know <vscale x 16 x i8> is exactly twice as large as
// <vscale x 8 x i8>). Until then, bail out for scalable vectors.
EVT MemVT = St->getMemoryVT();
if (MemVT.isScalableVector())
return false;
if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
return false;
// This function cannot currently deal with non-byte-sized memory sizes.
int64_t ElementSizeBytes = MemVT.getStoreSize();
if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
return false;
// Do not bother looking at stored values that are not constants, loads, or
// extracted vector elements.
SDValue StoredVal = peekThroughBitcasts(St->getValue());
const StoreSource StoreSrc = getStoreSource(StoredVal);
if (StoreSrc == StoreSource::Unknown)
return false;
SmallVector<MemOpLink, 8> StoreNodes;
SDNode *RootNode;
// Find potential store merge candidates by searching through chain sub-DAG
getStoreMergeCandidates(St, StoreNodes, RootNode);
// Check if there is anything to merge.
if (StoreNodes.size() < 2)
return false;
// Sort the memory operands according to their distance from the
// base pointer.
llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
return LHS.OffsetFromBase < RHS.OffsetFromBase;
});
bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
bool IsNonTemporalStore = St->isNonTemporal();
bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
cast<LoadSDNode>(StoredVal)->isNonTemporal();
// Store merging attempts to merge the lowest stores first. This generally
// works out well when the first merge succeeds, as the remaining stores are
// checked after the first collection of stores is merged. However, in the
// case that a non-mergeable store is found first, e.g., {p[-2],
// p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
// mergeable cases. To prevent this, we prune such stores from the
// front of StoreNodes here.
bool MadeChange = false;
while (StoreNodes.size() > 1) {
unsigned NumConsecutiveStores =
getConsecutiveStores(StoreNodes, ElementSizeBytes);
// There are no more stores in the list to examine.
if (NumConsecutiveStores == 0)
return MadeChange;
// We have at least 2 consecutive stores. Try to merge them.
assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
switch (StoreSrc) {
case StoreSource::Constant:
MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
MemVT, RootNode, AllowVectors);
break;
case StoreSource::Extract:
MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
MemVT, RootNode);
break;
case StoreSource::Load:
MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
MemVT, RootNode, AllowVectors,
IsNonTemporalStore, IsNonTemporalLoad);
break;
default:
llvm_unreachable("Unhandled store source type");
}
}
return MadeChange;
}
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
SDLoc SL(ST);
SDValue ReplStore;
// Replace the chain to avoid dependency.
if (ST->isTruncatingStore()) {
ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
ST->getBasePtr(), ST->getMemoryVT(),
ST->getMemOperand());
} else {
ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
ST->getMemOperand());
}
// Create token to keep both nodes around.
SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
MVT::Other, ST->getChain(), ReplStore);
// Make sure the new and old chains are cleaned up.
AddToWorklist(Token.getNode());
// Don't add users to work list.
return CombineTo(ST, Token, false);
}
SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
SDValue Value = ST->getValue();
if (Value.getOpcode() == ISD::TargetConstantFP)
return SDValue();
if (!ISD::isNormalStore(ST))
return SDValue();
SDLoc DL(ST);
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);
// NOTE: If the original store is volatile, this transform must not increase
// the number of stores. For example, on x86-32 an f64 can be stored in one
// processor operation but an i64 (which is not legal) requires two. So the
// transform should not be done in this case.
SDValue Tmp;
switch (CFP->getSimpleValueType(0).SimpleTy) {
default:
llvm_unreachable("Unknown FP type");
case MVT::f16: // We don't do this for these yet.
case MVT::f80:
case MVT::f128:
case MVT::ppcf128:
return SDValue();
case MVT::f32:
if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
bitcastToAPInt().getZExtValue(), SDLoc(CFP),
MVT::i32);
return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
}
return SDValue();
case MVT::f64:
if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
getZExtValue(), SDLoc(CFP), MVT::i64);
return DAG.getStore(Chain, DL, Tmp,
Ptr, ST->getMemOperand());
}
if (ST->isSimple() &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Many FP stores are not made apparent until after legalize, e.g. for
// argument passing. Since this is so common, custom legalize the
// 64-bit integer store into two 32-bit stores.
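// For instance (editorial illustration, not from the original comments):
// storing the f64 100.0 (bit pattern 0x4059000000000000) becomes a store of
// 0x00000000 at Ptr and 0x40590000 at Ptr + 4 on a little-endian target;
// the two halves are swapped for big-endian.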
uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
ST->getAlignment(), MMOFlags, AAInfo);
Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL);
Alignment = MinAlign(Alignment, 4U);
SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
ST->getPointerInfo().getWithOffset(4),
Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
St0, St1);
}
return SDValue();
}
}
SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
SDValue Value = ST->getValue();
SDValue Ptr = ST->getBasePtr();
// If this is a store of a bit convert, store the input value if the
// resultant store does not need a higher alignment than the original.
if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
ST->isUnindexed()) {
EVT SVT = Value.getOperand(0).getValueType();
// If the store is volatile, we only want to change the store type if the
// resulting store is legal. Otherwise we might increase the number of
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
// TODO: May be able to relax for unordered atomics (see D66309)
if (((!LegalOperations && ST->isSimple()) ||
TLI.isOperationLegal(ISD::STORE, SVT)) &&
TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
DAG, *ST->getMemOperand())) {
return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
ST->getMemOperand());
}
}
// Turn 'store undef, Ptr' -> nothing.
if (Value.isUndef() && ST->isUnindexed())
return Chain;
// Try to infer better alignment information than the store already has.
if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
if (*Alignment > ST->getAlign() &&
isAligned(*Alignment, ST->getSrcValueOffset())) {
SDValue NewStore =
DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
ST->getMemoryVT(), *Alignment,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
// NewStore will always be N as we are only refining the alignment
assert(NewStore.getNode() == N);
(void)NewStore;
}
}
}
// Try transforming a pair floating point load / store ops to integer
// load / store ops.
if (SDValue NewST = TransformFPLoadStorePair(N))
return NewST;
// Try transforming several stores into STORE (BSWAP).
if (SDValue Store = MatchStoreCombine(ST))
return Store;
if (ST->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes, on this store and any
// adjacent stores.
if (findBetterNeighborChains(ST)) {
// replaceStoreChain uses CombineTo, which handled all of the worklist
// manipulation. Return the original node to not do anything else.
return SDValue(ST, 0);
}
Chain = ST->getChain();
}
// FIXME: is there such a thing as a truncating indexed store?
if (ST->isTruncatingStore() && ST->isUnindexed() &&
Value.getValueType().isInteger() &&
(!isa<ConstantSDNode>(Value) ||
!cast<ConstantSDNode>(Value)->isOpaque())) {
APInt TruncDemandedBits =
APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
ST->getMemoryVT().getScalarSizeInBits());
// See if we can simplify the input to this truncstore with knowledge that
// only the low bits are being used. For example:
// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
AddToWorklist(Value.getNode());
if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
ST->getMemOperand());
// Otherwise, see if we can simplify the operation with
// SimplifyDemandedBits, which only works if the value has a single use.
if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
// Re-visit the store if anything changed and the store hasn't been merged
// with another node (in which case N is deleted). SimplifyDemandedBits will
// add Value's node back to the worklist if necessary, but we also need to
// re-visit the store node itself.
if (N->getOpcode() != ISD::DELETED_NODE)
AddToWorklist(N);
return SDValue(N, 0);
}
}
// If this is a load followed by a store to the same location, then the store
// is dead/noop.
// TODO: Can relax for unordered atomics (see D66309)
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
ST->isUnindexed() && ST->isSimple() &&
// There can't be any side effects between the load and store, such as
// a call or store.
Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
// The store is dead, remove it.
return Chain;
}
}
// TODO: Can relax for unordered atomics (see D66309)
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
if (ST->isUnindexed() && ST->isSimple() &&
ST1->isUnindexed() && ST1->isSimple()) {
if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
ST->getMemoryVT() == ST1->getMemoryVT()) {
// If this is a store followed by a store with the same value to the
// same location, then the store is dead/noop.
return Chain;
}
if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
!ST1->getBasePtr().isUndef() &&
// BaseIndexOffset and the code below requires knowing the size
// of a vector, so bail out if MemoryVT is scalable.
!ST1->getMemoryVT().isScalableVector()) {
const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits();
// If the current store completely overwrites the location written by the
// preceding store and no other node is chained to that store, the preceding
// store is dead and can effectively be dropped. Do not remove stores to undef
// as they may be used as data sinks.
if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
CombineTo(ST1, ST1->getChain());
return SDValue();
}
}
}
}
// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
// truncating store. We can do this even if this is already a truncstore.
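// For example: (store (i16 (truncate i32:x)), Ptr) --> (truncstore i32:x, Ptr, i16)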
if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
&& Value.getNode()->hasOneUse() && ST->isUnindexed() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
ST->getMemoryVT())) {
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
Ptr, ST->getMemoryVT(), ST->getMemOperand());
}
// Always perform this optimization before types are legal. If the target
// prefers, also try this after legalization to catch stores that were created
// by intrinsics or other nodes.
if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
while (true) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
bool Changed = mergeConsecutiveStores(ST);
if (!Changed) break;
// Return N as merge only uses CombineTo and no worklist clean
// up is necessary.
if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
return SDValue(N, 0);
}
}
// Try transforming N to an indexed store.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
//
// Make sure to do this only after attempting to merge stores in order to
// avoid changing the types of some subset of stores due to visit order,
// preventing their merging.
if (isa<ConstantFPSDNode>(ST->getValue())) {
if (SDValue NewSt = replaceStoreOfFPConstant(ST))
return NewSt;
}
if (SDValue NewSt = splitMergedValStore(ST))
return NewSt;
return ReduceLoadOpStoreWidth(N);
}
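/// Walk up the chain from a LIFETIME_END node and remove any simple,
/// unindexed store that writes entirely within the bounds of the object whose
/// lifetime is ending, since the stored value can never be observed.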
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
if (!LifetimeEnd->hasOffset())
return SDValue();
const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
LifetimeEnd->getOffset(), false);
// We walk up the chains to find stores.
SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
while (!Chains.empty()) {
SDValue Chain = Chains.back();
Chains.pop_back();
if (!Chain.hasOneUse())
continue;
switch (Chain.getOpcode()) {
case ISD::TokenFactor:
for (unsigned Nops = Chain.getNumOperands(); Nops;)
Chains.push_back(Chain.getOperand(--Nops));
break;
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
// We can forward past any lifetime start/end that can be proven not to
// alias the node.
if (!isAlias(Chain.getNode(), N))
Chains.push_back(Chain.getOperand(0));
break;
case ISD::STORE: {
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
// TODO: Can relax for unordered atomics (see D66309)
if (!ST->isSimple() || ST->isIndexed())
continue;
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
ST->getMemoryVT().getStoreSizeInBits())) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
CombineTo(ST, ST->getChain());
return SDValue(N, 0);
}
}
}
}
return SDValue();
}
/// For the store instruction sequence below, the F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
/// (shl (zext I to i64), 32)), addr) -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting for other merged store can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8}, i32 store --> two i16 stores.
/// For pair of {i8, i8}, i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// These store patterns are commonly seen from the simple code snippet below
/// when only std::make_pair(...) is SROA-transformed before being inlined into
/// hoo.
/// void goo(const std::pair<int, float> &);
/// void hoo() {
/// ...
/// goo(std::make_pair(tmp, ftmp));
/// ...
/// }
///
SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
if (OptLevel == CodeGenOpt::None)
return SDValue();
// Can't change the number of memory accesses for a volatile store or break
// atomicity for an atomic one.
if (!ST->isSimple())
return SDValue();
SDValue Val = ST->getValue();
SDLoc DL(ST);
// Match OR operand.
if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
return SDValue();
// Match SHL operand and get Lower and Higher parts of Val.
SDValue Op1 = Val.getOperand(0);
SDValue Op2 = Val.getOperand(1);
SDValue Lo, Hi;
if (Op1.getOpcode() != ISD::SHL) {
std::swap(Op1, Op2);
if (Op1.getOpcode() != ISD::SHL)
return SDValue();
}
Lo = Op2;
Hi = Op1.getOperand(0);
if (!Op1.hasOneUse())
return SDValue();
// Match shift amount to HalfValBitSize.
unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
return SDValue();
// Lo and Hi must each be zero-extended from an integer type no wider than
// half the value's width (HalfValBitSize).
if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
!Lo.getOperand(0).getValueType().isScalarInteger() ||
Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
!Hi.getOperand(0).getValueType().isScalarInteger() ||
Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
return SDValue();
// Use the EVT of low and high parts before bitcast as the input
// of target query.
EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
? Lo.getOperand(0).getValueType()
: Lo.getValueType();
EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
? Hi.getOperand(0).getValueType()
: Hi.getValueType();
if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
return SDValue();
// Start to split store.
unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
// Change the sizes of Lo and Hi's value types to HalfValBitSize.
EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
// Lower value store.
SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
ST->getAlignment(), MMOFlags, AAInfo);
Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL);
// Higher value store.
SDValue St1 =
DAG.getStore(St0, DL, Hi, Ptr,
ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
Alignment / 2, MMOFlags, AAInfo);
return St1;
}
/// Convert a disguised subvector insertion into a shuffle:
SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Expected extract_vector_elt");
SDValue InsertVal = N->getOperand(1);
SDValue Vec = N->getOperand(0);
// (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
// InsIndex)
// --> (vector_shuffle X, Y) and variations where shuffle operands may be
// CONCAT_VECTORS.
if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(InsertVal.getOperand(1))) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
ArrayRef<int> Mask = SVN->getMask();
SDValue X = Vec.getOperand(0);
SDValue Y = Vec.getOperand(1);
// Vec's operand 0 is using indices from 0 to N-1 and
// operand 1 from N to 2N - 1, where N is the number of
// elements in the vectors.
SDValue InsertVal0 = InsertVal.getOperand(0);
int ElementOffset = -1;
// We explore the inputs of the shuffle in order to see if we find the
// source of the extract_vector_elt. If so, we can use it to modify the
// shuffle rather than perform an insert_vector_elt.
SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
ArgWorkList.emplace_back(Mask.size(), Y);
ArgWorkList.emplace_back(0, X);
while (!ArgWorkList.empty()) {
int ArgOffset;
SDValue ArgVal;
std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
if (ArgVal == InsertVal0) {
ElementOffset = ArgOffset;
break;
}
// Peek through concat_vector.
if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
int CurrentArgOffset =
ArgOffset + ArgVal.getValueType().getVectorNumElements();
int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
for (SDValue Op : reverse(ArgVal->ops())) {
CurrentArgOffset -= Step;
ArgWorkList.emplace_back(CurrentArgOffset, Op);
}
// Make sure we went through all the elements and did not screw up index
// computation.
assert(CurrentArgOffset == ArgOffset);
}
}
if (ElementOffset != -1) {
SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
assert(NewMask[InsIndex] <
(int)(2 * Vec.getValueType().getVectorNumElements()) &&
NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
Y, NewMask, DAG);
if (LegalShuffle)
return LegalShuffle;
}
}
// insert_vector_elt V, (bitcast X from vector type), IdxC -->
// bitcast(shuffle (bitcast V), (extended X), Mask)
// Note: We do not use an insert_subvector node because that requires a
// legal subvector type.
if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
!InsertVal.getOperand(0).getValueType().isVector())
return SDValue();
SDValue SubVec = InsertVal.getOperand(0);
SDValue DestVec = N->getOperand(0);
EVT SubVecVT = SubVec.getValueType();
EVT VT = DestVec.getValueType();
unsigned NumSrcElts = SubVecVT.getVectorNumElements();
// If the source has only a single vector element, the cost of creating the
// shuffle is likely to exceed the cost of a single insert_vector_elt.
if (NumSrcElts == 1)
return SDValue();
unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
unsigned NumMaskVals = ExtendRatio * NumSrcElts;
// Step 1: Create a shuffle mask that implements this insert operation. The
// vector that we are inserting into will be operand 0 of the shuffle, so
// those elements are just 'i'. The inserted subvector is in the first
// positions of operand 1 of the shuffle. Example:
// insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
SmallVector<int, 16> Mask(NumMaskVals);
for (unsigned i = 0; i != NumMaskVals; ++i) {
if (i / NumSrcElts == InsIndex)
Mask[i] = (i % NumSrcElts) + NumMaskVals;
else
Mask[i] = i;
}
// Bail out if the target can not handle the shuffle we want to create.
EVT SubVecEltVT = SubVecVT.getVectorElementType();
EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
return SDValue();
// Step 2: Create a wide vector from the inserted source vector by appending
// undefined elements. This is the same size as our destination vector.
SDLoc DL(N);
SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
ConcatOps[0] = SubVec;
SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
// Step 3: Shuffle in the padded subvector.
SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
AddToWorklist(PaddedSubV.getNode());
AddToWorklist(DestVecBC.getNode());
AddToWorklist(Shuf.getNode());
return DAG.getBitcast(VT, Shuf);
}
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
SDValue EltNo = N->getOperand(2);
SDLoc DL(N);
EVT VT = InVec.getValueType();
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
// Inserting into an out-of-bounds element is undefined.
if (IndexC && VT.isFixedLengthVector() &&
IndexC->getZExtValue() >= VT.getVectorNumElements())
return DAG.getUNDEF(VT);
// Remove redundant insertions:
// (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;
if (!IndexC) {
// If this is a variable insert into an undef vector, it might be better to splat:
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
if (VT.isScalableVector())
return DAG.getSplatVector(VT, DL, InVal);
else {
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
return DAG.getBuildVector(VT, DL, Ops);
}
}
return SDValue();
}
if (VT.isScalableVector())
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
// We must know which element is being inserted for folds below here.
unsigned Elt = IndexC->getZExtValue();
if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
return Shuf;
// Canonicalize insert_vector_elt dag nodes.
// Example:
// (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
// -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
//
// Do this only if the child insert_vector node has one use; also
// do this only if indices are both constants and Idx1 < Idx0.
if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
&& isa<ConstantSDNode>(InVec.getOperand(2))) {
unsigned OtherElt = InVec.getConstantOperandVal(2);
if (Elt < OtherElt) {
// Swap nodes.
SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
InVec.getOperand(0), InVal, EltNo);
AddToWorklist(NewOp.getNode());
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
}
}
// If we can't generate a legal BUILD_VECTOR, exit
if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
return SDValue();
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
// vector elements.
SmallVector<SDValue, 8> Ops;
// Do not combine these two vectors if the output vector will not replace
// the input vector.
if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
Ops.append(InVec.getNode()->op_begin(),
InVec.getNode()->op_end());
} else if (InVec.isUndef()) {
Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
} else {
return SDValue();
}
assert(Ops.size() == NumElts && "Unexpected vector size");
// Insert the element
if (Elt < Ops.size()) {
// All the operands of BUILD_VECTOR must have the same type;
// we enforce that here.
EVT OpVT = Ops[0].getValueType();
Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
}
// Return the new vector
return DAG.getBuildVector(VT, DL, Ops);
}
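/// Given (extract_vector_elt (load $addr), idx), produce a narrow scalar load
/// of just the extracted element from the adjusted address. The caller is
/// expected to have checked that the original load is simple and that the
/// extract is effectively its only user.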
SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue EltNo,
LoadSDNode *OriginalLoad) {
assert(OriginalLoad->isSimple());
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
Align Alignment = OriginalLoad->getAlign();
Align NewAlign = DAG.getDataLayout().getABITypeAlign(
VecEltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Alignment ||
!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
ISD::NON_EXTLOAD : ISD::EXTLOAD;
if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
Alignment = NewAlign;
SDValue NewPtr = OriginalLoad->getBasePtr();
SDValue Offset;
EVT PtrType = NewPtr.getValueType();
MachinePointerInfo MPI;
SDLoc DL(EVE);
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
int Elt = ConstEltNo->getZExtValue();
unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
Offset = DAG.getConstant(PtrOff, DL, PtrType);
MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
} else {
Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
Offset = DAG.getNode(
ISD::MUL, DL, PtrType, Offset,
DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
// Discard the pointer info except the address space because the memory
// operand can't represent this new access since the offset is variable.
MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
}
NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL);
// The replacement we need to do here is a little tricky: we need to
// replace an extractelement of a load with a load.
// Use ReplaceAllUsesOfValuesWith to do the replacement.
// Note that this replacement assumes that the extractvalue is the only
// use of the load; that's okay because we don't want to perform this
// transformation in other cases anyway.
SDValue Load;
SDValue Chain;
if (ResultVT.bitsGT(VecEltVT)) {
// If the result type of vextract is wider than the load, then issue an
// extending load instead.
ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
VecEltVT)
? ISD::ZEXTLOAD
: ISD::EXTLOAD;
Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
Alignment, OriginalLoad->getMemOperand()->getFlags(),
OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
} else {
Load = DAG.getLoad(
VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment,
OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
if (ResultVT.bitsLT(VecEltVT))
Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
else
Load = DAG.getBitcast(ResultVT, Load);
}
WorklistRemover DeadNodes(*this);
SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
SDValue To[] = { Load, Chain };
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
// Make sure to revisit this node to clean it up; it will usually be dead.
AddToWorklist(EVE);
// Since we're explicitly calling ReplaceAllUses, add the new node to the
// worklist explicitly as well.
AddToWorklistWithUsers(Load.getNode());
++OpsNarrowed;
return SDValue(EVE, 0);
}
/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
bool LegalOperations) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
Vec.getNode()->getNumValues() != 1)
return SDValue();
// Targets may want to avoid this to prevent an expensive register transfer.
if (!TLI.shouldScalarizeBinop(Vec))
return SDValue();
// Extracting an element of a vector constant is constant-folded, so this
// transform is just replacing a vector op with a scalar op while moving the
// extract.
SDValue Op0 = Vec.getOperand(0);
SDValue Op1 = Vec.getOperand(1);
if (isAnyConstantBuildVector(Op0, true) ||
isAnyConstantBuildVector(Op1, true)) {
// extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
// extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
SDLoc DL(ExtElt);
EVT VT = ExtElt->getValueType(0);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
}
return SDValue();
}
SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
SDValue VecOp = N->getOperand(0);
SDValue Index = N->getOperand(1);
EVT ScalarVT = N->getValueType(0);
EVT VecVT = VecOp.getValueType();
if (VecOp.isUndef())
return DAG.getUNDEF(ScalarVT);
// (extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
//
// This only really matters if the index is non-constant since other combines
// on the constant elements already work.
SDLoc DL(N);
if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
Index == VecOp.getOperand(2)) {
SDValue Elt = VecOp.getOperand(1);
return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
}
// (vextract (scalar_to_vector val), 0) -> val
if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// Only 0'th element of SCALAR_TO_VECTOR is defined.
if (DAG.isKnownNeverZero(Index))
return DAG.getUNDEF(ScalarVT);
// Check if the result type doesn't match the inserted element type. A
// SCALAR_TO_VECTOR may truncate the inserted element and the
// EXTRACT_VECTOR_ELT may widen the extracted vector.
SDValue InOp = VecOp.getOperand(0);
if (InOp.getValueType() != ScalarVT) {
assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
return DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
}
return InOp;
}
// extract_vector_elt of out-of-bounds element -> UNDEF
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (IndexC && VecVT.isFixedLengthVector() &&
IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
// extract_vector_elt (build_vector x, y), 1 -> y
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
TLI.isTypeLegal(VecVT) &&
(VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
VecVT.isFixedLengthVector()) &&
"BUILD_VECTOR used for scalable vectors");
unsigned IndexVal =
VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
SDValue Elt = VecOp.getOperand(IndexVal);
EVT InEltVT = Elt.getValueType();
// Sometimes build_vector's scalar input types do not match result type.
if (ScalarVT == InEltVT)
return Elt;
// TODO: It may be useful to truncate if free if the build_vector implicitly
// converts.
}
if (VecVT.isScalableVector())
return SDValue();
// All the code from this point onwards assumes fixed width vectors, but it's
// possible that some of the combinations could be made to work for scalable
// vectors too.
unsigned NumElts = VecVT.getVectorNumElements();
unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
// TODO: These transforms should not require the 'hasOneUse' restriction, but
// there are regressions on multiple targets without it. We can end up with a
// mess of scalar and vector code if we reduce only part of the DAG to scalar.
if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
VecOp.hasOneUse()) {
// The vector index of the LSBs of the source depends on the endianness.
bool IsLE = DAG.getDataLayout().isLittleEndian();
unsigned ExtractIndex = IndexC->getZExtValue();
// extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
SDValue BCSrc = VecOp.getOperand(0);
if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);
if (LegalTypes && BCSrc.getValueType().isInteger() &&
BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
// trunc i64 X to i32
SDValue X = BCSrc.getOperand(0);
assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
"Extract element and scalar to vector can't change element type "
"from FP to integer.");
unsigned XBitWidth = X.getValueSizeInBits();
BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
// An extract element return value type can be wider than its vector
// operand element type. In that case, the high bits are undefined, so
// it's possible that we may need to extend rather than truncate.
if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
assert(XBitWidth % VecEltBitWidth == 0 &&
"Scalar bitwidth must be a multiple of vector element bitwidth");
return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
}
}
}
if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
return BO;
// Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
// We only perform this optimization before the op legalization phase because
// we may introduce new vector instructions which are not backed by TD
// patterns, e.g. extracting elements from a wide vector on AVX without using
// extract_subvector. However, if we can find an underlying scalar value, then
// we can always use that.
if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
// Find the new index to extract from.
int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
// Extracting an undef index is undef.
if (OrigElt == -1)
return DAG.getUNDEF(ScalarVT);
// Select the right vector half to extract from.
SDValue SVInVec;
if (OrigElt < (int)NumElts) {
SVInVec = VecOp.getOperand(0);
} else {
SVInVec = VecOp.getOperand(1);
OrigElt -= NumElts;
}
if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
SDValue InOp = SVInVec.getOperand(OrigElt);
if (InOp.getValueType() != ScalarVT) {
assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
}
return InOp;
}
// FIXME: We should handle recursing on other vector shuffles and
// scalar_to_vector here as well.
if (!LegalOperations ||
// FIXME: Should really be just isOperationLegalOrCustom.
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
DAG.getVectorIdxConstant(OrigElt, DL));
}
}
// If only EXTRACT_VECTOR_ELT nodes use the source vector we can
// simplify it based on the (valid) extraction indices.
if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Use->getOperand(0) == VecOp &&
isa<ConstantSDNode>(Use->getOperand(1));
})) {
APInt DemandedElts = APInt::getNullValue(NumElts);
for (SDNode *Use : VecOp->uses()) {
auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
if (CstElt->getAPIntValue().ult(NumElts))
DemandedElts.setBit(CstElt->getZExtValue());
}
if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
if (N->getOpcode() != ISD::DELETED_NODE)
AddToWorklist(N);
return SDValue(N, 0);
}
APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
if (N->getOpcode() != ISD::DELETED_NODE)
AddToWorklist(N);
return SDValue(N, 0);
}
}
// Everything under here is trying to match an extract of a loaded value.
// If the result of load has to be truncated, then it's not necessarily
// profitable.
bool BCNumEltsChanged = false;
EVT ExtVT = VecVT.getVectorElementType();
EVT LVT = ExtVT;
if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
return SDValue();
if (VecOp.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
EVT BCVT = VecOp.getOperand(0).getValueType();
if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
return SDValue();
if (NumElts != BCVT.getVectorNumElements())
BCNumEltsChanged = true;
VecOp = VecOp.getOperand(0);
ExtVT = BCVT.getVectorElementType();
}
// extract (vector load $addr), i --> load $addr + i * size
if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
ISD::isNormalLoad(VecOp.getNode()) &&
!Index->hasPredecessor(VecOp.getNode())) {
auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
if (VecLoad && VecLoad->isSimple())
return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
}
// Perform only after legalization to ensure build_vector / vector_shuffle
// optimizations have already been done.
if (!LegalOperations || !IndexC)
return SDValue();
// (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
// (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
// (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
int Elt = IndexC->getZExtValue();
LoadSDNode *LN0 = nullptr;
if (ISD::isNormalLoad(VecOp.getNode())) {
LN0 = cast<LoadSDNode>(VecOp);
} else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
VecOp.getOperand(0).getValueType() == ExtVT &&
ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
}
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
// (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
// =>
// (load $addr+1*size)
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
// If the bit convert changed the number of elements, it is unsafe
// to examine the mask.
if (BCNumEltsChanged)
return SDValue();
// Select the input vector, guarding against an out-of-range extract index.
int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
if (VecOp.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
VecOp = VecOp.getOperand(0);
}
if (ISD::isNormalLoad(VecOp.getNode())) {
LN0 = cast<LoadSDNode>(VecOp);
Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
Index = DAG.getConstant(Elt, DL, Index.getValueType());
}
} else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
VecVT.getVectorElementType() == ScalarVT &&
(!LegalTypes ||
TLI.isTypeLegal(
VecOp.getOperand(0).getValueType().getVectorElementType()))) {
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
// -> extract_vector_elt a, 0
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
// -> extract_vector_elt a, 1
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
// -> extract_vector_elt b, 0
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
// -> extract_vector_elt b, 1
SDLoc SL(N);
EVT ConcatVT = VecOp.getOperand(0).getValueType();
unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
Index.getValueType());
SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
ConcatVT.getVectorElementType(),
ConcatOp, NewIdx);
return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
}
// Make sure we found a simple (non-volatile, non-atomic) load and that the
// extractelement is its only use.
if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
return SDValue();
// If Idx was -1 above, Elt is going to be -1, so just return undef.
if (Elt == -1)
return DAG.getUNDEF(LVT);
return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
}
// Simplify (build_vec (ext )) to (bitcast (build_vec ))
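// For example, on a little-endian target:
//   (v2i64 build_vector (zext i32:a), (zext i32:b))
//     --> (v2i64 (bitcast (v4i32 build_vector a, 0, b, 0)))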
SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
// We perform this optimization post type-legalization because
// the type-legalizer often scalarizes integer-promoted vectors.
// Performing this optimization before may create bit-casts which
// will be type-legalized to complex code sequences.
// We perform this optimization only before the operation legalizer because we
// may introduce illegal operations.
if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
return SDValue();
unsigned NumInScalars = N->getNumOperands();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Check to see if this is a BUILD_VECTOR of a bunch of values
// which come from any_extend or zero_extend nodes. If so, we can create
// a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
// optimizations. We do not handle sign-extend because we can't fill the sign
// using shuffles.
EVT SourceType = MVT::Other;
bool AllAnyExt = true;
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = N->getOperand(i);
// Ignore undef inputs.
if (In.isUndef()) continue;
bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
// Abort if the element is not an extension.
if (!ZeroExt && !AnyExt) {
SourceType = MVT::Other;
break;
}
// The input is a ZeroExt or AnyExt. Check the original type.
EVT InTy = In.getOperand(0).getValueType();
// Check that all of the widened source types are the same.
if (SourceType == MVT::Other)
// First time.
SourceType = InTy;
else if (InTy != SourceType) {
// Multiple incoming source types. Abort.
SourceType = MVT::Other;
break;
}
// Check if all of the extends are ANY_EXTENDs.
AllAnyExt &= AnyExt;
}
// In order to have valid types, all of the inputs must be extended from the
// same source type and all of the inputs must be any or zero extend.
// Scalar sizes must be a power of two.
EVT OutScalarTy = VT.getScalarType();
bool ValidTypes = SourceType != MVT::Other &&
isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
isPowerOf2_32(SourceType.getSizeInBits());
// Create a new simpler BUILD_VECTOR sequence which other optimizations can
// turn into a single shuffle instruction.
if (!ValidTypes)
return SDValue();
// If we already have a splat buildvector, then don't fold it if it means
// introducing zeros.
if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
return SDValue();
bool isLE = DAG.getDataLayout().isLittleEndian();
unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
assert(ElemRatio > 1 && "Invalid element size ratio");
SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
DAG.getConstant(0, DL, SourceType);
unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
// Populate the new build_vector
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDValue Cast = N->getOperand(i);
assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
Cast.getOpcode() == ISD::ZERO_EXTEND ||
Cast.isUndef()) && "Invalid cast opcode");
SDValue In;
if (Cast.isUndef())
In = DAG.getUNDEF(SourceType);
else
In = Cast->getOperand(0);
unsigned Index = isLE ? (i * ElemRatio) :
(i * ElemRatio + (ElemRatio - 1));
assert(Index < Ops.size() && "Invalid index");
Ops[Index] = In;
}
// The type of the new BUILD_VECTOR node.
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
"Invalid vector size");
// Check if the new vector type is legal.
if (!isTypeLegal(VecVT) ||
(!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
return SDValue();
// Make the new BUILD_VECTOR.
SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
// The new BUILD_VECTOR node has the potential to be further optimized.
AddToWorklist(BV.getNode());
// Bitcast to the desired type.
return DAG.getBitcast(VT, BV);
}
// Simplify (build_vec (trunc $1)
// (trunc (srl $1 half-width))
// (trunc (srl $1 (2 * half-width))) …)
// to (bitcast $1)
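// For example, on a little-endian target:
//   (v4i16 build_vector (trunc i64:x), (trunc (srl i64:x, 16)),
//                       (trunc (srl i64:x, 32)), (trunc (srl i64:x, 48)))
//     --> (v4i16 (bitcast i64:x))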
SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
// Only for little endian
if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT OutScalarTy = VT.getScalarType();
uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
// Only for power of two types to be sure that bitcast works well
if (!isPowerOf2_64(ScalarTypeBitsize))
return SDValue();
unsigned NumInScalars = N->getNumOperands();
// Look through bitcasts
auto PeekThroughBitcast = [](SDValue Op) {
if (Op.getOpcode() == ISD::BITCAST)
return Op.getOperand(0);
return Op;
};
// The source value where all the parts are extracted.
SDValue Src;
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = PeekThroughBitcast(N->getOperand(i));
// Ignore undef inputs.
if (In.isUndef()) continue;
if (In.getOpcode() != ISD::TRUNCATE)
return SDValue();
In = PeekThroughBitcast(In.getOperand(0));
if (In.getOpcode() != ISD::SRL) {
// For now, only handle build_vec without shuffling; handle shifts here in
// the future.
if (i != 0)
return SDValue();
Src = In;
} else {
// In is SRL
SDValue part = PeekThroughBitcast(In.getOperand(0));
if (!Src) {
Src = part;
} else if (Src != part) {
// Vector parts do not stem from the same variable
return SDValue();
}
SDValue ShiftAmtVal = In.getOperand(1);
if (!isa<ConstantSDNode>(ShiftAmtVal))
return SDValue();
uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1);
// The extracted value is not extracted at the right position
if (ShiftAmt != i * ScalarTypeBitsize)
return SDValue();
}
}
// Only cast if the size is the same
if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
return SDValue();
return DAG.getBitcast(VT, Src);
}
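// Helper for reduceBuildVecToShuffle: try to materialize the elements of the
// BUILD_VECTOR node N that come from the input pair (VecIn1, VecIn2) as a
// single vector_shuffle, reconciling mismatched input/output vector types by
// concatenating, splitting, or widening the inputs where possible.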
SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
unsigned LeftIdx, bool DidSplitVec) {
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
EVT VT = N->getValueType(0);
EVT InVT1 = VecIn1.getValueType();
EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
unsigned NumElems = VT.getVectorNumElements();
unsigned ShuffleNumElems = NumElems;
// If we artificially split a vector in two already, then the offsets in the
// operands will all be based off of VecIn1, even those in VecIn2.
unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
// We can't generate a shuffle node with mismatched input and output types.
// Try to make the types match the type of the output.
if (InVT1 != VT || InVT2 != VT) {
if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) {
// If the output vector length is a multiple of both input lengths,
// we can concatenate them and pad the rest with undefs.
unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits();
assert(NumConcats >= 2 && "Concat needs at least two inputs!");
SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
ConcatOps[0] = VecIn1;
ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
VecIn2 = SDValue();
} else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
return SDValue();
if (!VecIn2.getNode()) {
// If we only have one input vector, and it's twice the size of the
// output, split it in two.
VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
DAG.getVectorIdxConstant(NumElems, DL));
VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
// Since we now have shorter input vectors, adjust the offset of the
// second vector's start.
Vec2Offset = NumElems;
} else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
// VecIn1 is wider than the output, and we have another, possibly
// smaller input. Pad the smaller input with undefs, shuffle at the
// input vector width, and extract the output.
// The shuffle type is different than VT, so check legality again.
if (LegalOperations &&
!TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
return SDValue();
// Legalizing INSERT_SUBVECTOR is tricky - you basically have to
// lower it back into a BUILD_VECTOR. So if the inserted type is
// illegal, don't even try.
if (InVT1 != InVT2) {
if (!TLI.isTypeLegal(InVT2))
return SDValue();
VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
}
ShuffleNumElems = NumElems * 2;
} else {
// Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
// than VecIn1. We can't handle this for now - this case will disappear
// when we start sorting the vectors by type.
return SDValue();
}
} else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
InVT1.getSizeInBits() == VT.getSizeInBits()) {
SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
ConcatOps[0] = VecIn2;
VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
} else {
// TODO: Support cases where the length mismatch isn't exactly by a
// factor of 2.
// TODO: Move this check upwards, so that if we have bad type
// mismatches, we don't create any DAG nodes.
return SDValue();
}
}
// Initialize mask to undef.
SmallVector<int, 8> Mask(ShuffleNumElems, -1);
// Only need to run up to the number of elements actually used, not the
// total number of elements in the shuffle - if we are shuffling a wider
// vector, the high lanes should be set to undef.
for (unsigned i = 0; i != NumElems; ++i) {
if (VectorMask[i] <= 0)
continue;
unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
if (VectorMask[i] == (int)LeftIdx) {
Mask[i] = ExtIndex;
} else if (VectorMask[i] == (int)LeftIdx + 1) {
Mask[i] = Vec2Offset + ExtIndex;
}
}
// The type the input vectors may have changed above.
InVT1 = VecIn1.getValueType();
// If we already have a VecIn2, it should have the same type as VecIn1.
// If we don't, get an undef/zero vector of the appropriate type.
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
if (ShuffleNumElems > NumElems)
Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);
return Shuffle;
}
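// If the build vector consists of undefs plus exactly one element that is a
// zero-extended extract_vector_elt at a constant index, rewrite it as a
// shuffle of the extract's source vector with an all-zeros vector.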
static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
// First, determine where the build vector is not undef.
// TODO: We could extend this to handle zero elements as well as undefs.
int NumBVOps = BV->getNumOperands();
int ZextElt = -1;
for (int i = 0; i != NumBVOps; ++i) {
SDValue Op = BV->getOperand(i);
if (Op.isUndef())
continue;
if (ZextElt == -1)
ZextElt = i;
else
return SDValue();
}
// Bail out if there's no non-undef element.
if (ZextElt == -1)
return SDValue();
// The build vector contains some number of undef elements and exactly
// one other element. That other element must be a zero-extended scalar
// extracted from a vector at a constant index to turn this into a shuffle.
// Also, require that the build vector does not implicitly truncate/extend
// its elements.
// TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
EVT VT = BV->getValueType(0);
SDValue Zext = BV->getOperand(ZextElt);
if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
// The zero-extend must be a multiple of the source size, and we must be
// building a vector of the same size as the source of the extract element.
SDValue Extract = Zext.getOperand(0);
unsigned DestSize = Zext.getValueSizeInBits();
unsigned SrcSize = Extract.getValueSizeInBits();
if (DestSize % SrcSize != 0 ||
Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
return SDValue();
// Create a shuffle mask that will combine the extracted element with zeros
// and undefs.
int ZextRatio = DestSize / SrcSize;
int NumMaskElts = NumBVOps * ZextRatio;
SmallVector<int, 32> ShufMask(NumMaskElts, -1);
for (int i = 0; i != NumMaskElts; ++i) {
if (i / ZextRatio == ZextElt) {
// The low bits of the (potentially translated) extracted element map to
// the source vector. The high bits map to zero. We will use a zero vector
// as the 2nd source operand of the shuffle, so use the 1st element of
// that vector (mask value is number-of-elements) for the high bits.
if (i % ZextRatio == 0)
ShufMask[i] = Extract.getConstantOperandVal(1);
else
ShufMask[i] = NumMaskElts;
}
// Undef elements of the build vector remain undef because we initialize
// the shuffle mask with -1.
}
// buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
// bitcast (shuffle V, ZeroVec, VectorMask)
SDLoc DL(BV);
EVT VecVT = Extract.getOperand(0).getValueType();
SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
ZeroVec, ShufMask, DAG);
if (!Shuf)
return SDValue();
return DAG.getBitcast(VT, Shuf);
}
// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
// operations. If the types of the vectors we're extracting from allow it,
// turn this into a vector_shuffle node.
SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
if (!isTypeLegal(VT))
return SDValue();
if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
return V;
// May only combine to shuffle after legalize if shuffle is legal.
if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
return SDValue();
bool UsesZeroVector = false;
unsigned NumElems = N->getNumOperands();
// Record, for each element of the newly built vector, which input vector
// that element comes from. -1 stands for undef, 0 for the zero vector,
// and positive values for the input vectors.
// VectorMask maps each element to its vector number, and VecIn maps vector
// numbers to their initial SDValues.
SmallVector<int, 8> VectorMask(NumElems, -1);
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
if (Op.isUndef())
continue;
// See if we can use a blend with a zero vector.
// TODO: Should we generalize this to a blend with an arbitrary constant
// vector?
if (isNullConstant(Op) || isNullFPConstant(Op)) {
UsesZeroVector = true;
VectorMask[i] = 0;
continue;
}
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
if (VT.getVectorElementType() !=
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
unsigned Idx = std::distance(
VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
if (Idx == VecIn.size())
VecIn.push_back(ExtractedFromVec);
VectorMask[i] = Idx;
}
// If we didn't find at least one input vector, bail out.
if (VecIn.size() < 2)
return SDValue();
// If all the operands of the BUILD_VECTOR extract from the same
// vector, then split that vector efficiently based on the maximum
// vector access index and adjust VectorMask and
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
EVT InVT = Vec.getValueType();
SmallVector<unsigned, 8> IndexVec(NumElems, 0);
for (unsigned i = 0; i < NumElems; i++) {
if (VectorMask[i] <= 0)
continue;
unsigned Index = N->getOperand(i).getConstantOperandVal(1);
IndexVec[i] = Index;
MaxIndex = std::max(MaxIndex, Index);
}
NearestPow2 = PowerOf2Ceil(MaxIndex);
if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
NumElems * 2 < NearestPow2) {
unsigned SplitSize = NearestPow2 / 2;
EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(), SplitSize);
if (TLI.isTypeLegal(SplitVT)) {
SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
DAG.getVectorIdxConstant(SplitSize, DL));
SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
DAG.getVectorIdxConstant(0, DL));
VecIn.pop_back();
VecIn.push_back(VecIn1);
VecIn.push_back(VecIn2);
DidSplitVec = true;
for (unsigned i = 0; i < NumElems; i++) {
if (VectorMask[i] <= 0)
continue;
VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
}
}
}
}
// TODO: We want to sort the vectors by descending length, so that adjacent
// pairs have similar length, and the longer vector is always first in the
// pair.
// TODO: Should this fire if some of the input vectors have an illegal type
// (as it does now), or should we let legalization run its course first?
// Shuffle phase:
// Take pairs of vectors, and shuffle them so that the result has elements
// from these vectors in the correct places.
// For example, given:
// t10: i32 = extract_vector_elt t1, Constant:i64<0>
// t11: i32 = extract_vector_elt t2, Constant:i64<0>
// t12: i32 = extract_vector_elt t3, Constant:i64<0>
// t13: i32 = extract_vector_elt t1, Constant:i64<1>
// t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
// We will generate:
// t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
// t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
SmallVector<SDValue, 4> Shuffles;
for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
unsigned LeftIdx = 2 * In + 1;
SDValue VecLeft = VecIn[LeftIdx];
SDValue VecRight =
(LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
VecRight, LeftIdx, DidSplitVec))
Shuffles.push_back(Shuffle);
else
return SDValue();
}
// If we need the zero vector as an "ingredient" in the blend tree, add it
// to the list of shuffles.
if (UsesZeroVector)
Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT));
// If we only have one shuffle, we're done.
if (Shuffles.size() == 1)
return Shuffles[0];
// Update the vector mask to point to the post-shuffle vectors.
for (int &Vec : VectorMask)
if (Vec == 0)
Vec = Shuffles.size() - 1;
else
Vec = (Vec - 1) / 2;
// More than one shuffle. Generate a binary tree of blends, e.g. if from
// the previous step we got the set of shuffles t10, t11, t12, t13, we will
// generate:
// t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
// t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
// t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
// t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
// t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
// t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
// t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
// Make sure the initial size of the shuffle list is even.
if (Shuffles.size() % 2)
Shuffles.push_back(DAG.getUNDEF(VT));
for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
if (CurSize % 2) {
Shuffles[CurSize] = DAG.getUNDEF(VT);
CurSize++;
}
for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
int Left = 2 * In;
int Right = 2 * In + 1;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
if (VectorMask[i] == Left) {
Mask[i] = i;
VectorMask[i] = In;
} else if (VectorMask[i] == Right) {
Mask[i] = i + NumElems;
VectorMask[i] = In;
}
}
Shuffles[In] =
DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
}
}
return Shuffles[0];
}
// Try to turn a build vector of zero extends of extract vector elts into a
// vector zero extend and possibly an extract subvector.
// TODO: Support sign extend?
// TODO: Allow undef elements?
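// For example:
//   (v4i32 build_vector (zext (extractelt v8i16:x, 4)),
//                       (zext (extractelt v8i16:x, 5)),
//                       (zext (extractelt v8i16:x, 6)),
//                       (zext (extractelt v8i16:x, 7)))
//     --> (v4i32 (zero_extend (v4i16 (extract_subvector v8i16:x, 4))))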
SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
if (LegalOperations)
return SDValue();
EVT VT = N->getValueType(0);
bool FoundZeroExtend = false;
SDValue Op0 = N->getOperand(0);
auto checkElem = [&](SDValue Op) -> int64_t {
unsigned Opc = Op.getOpcode();
FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
return C->getZExtValue();
return -1;
};
// Make sure the first element matches
// (zext (extract_vector_elt X, C))
int64_t Offset = checkElem(Op0);
if (Offset < 0)
return SDValue();
unsigned NumElems = N->getNumOperands();
SDValue In = Op0.getOperand(0).getOperand(0);
EVT InSVT = In.getValueType().getScalarType();
EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
// Don't create an illegal input type after type legalization.
if (LegalTypes && !TLI.isTypeLegal(InVT))
return SDValue();
// Ensure all the elements come from the same vector and are adjacent.
for (unsigned i = 1; i != NumElems; ++i) {
if ((Offset + i) != checkElem(N->getOperand(i)))
return SDValue();
}
SDLoc DL(N);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
Op0.getOperand(0).getOperand(1));
return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
VT, In);
}
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
// A vector built entirely of undefs is undef.
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
// If this is a splat of a bitcast from another vector, change to a
// concat_vector.
// For example:
// (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
// (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
//
// If X is a build_vector itself, the concat can become a larger build_vector.
// TODO: Maybe this is useful for non-splat too?
if (!LegalOperations) {
if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
Splat = peekThroughBitcasts(Splat);
EVT SrcVT = Splat.getValueType();
if (SrcVT.isVector()) {
unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
SrcVT.getVectorElementType(), NumElts);
if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
NewVT, Ops);
return DAG.getBitcast(VT, Concat);
}
}
}
}
// A splat of a single element is a SPLAT_VECTOR if supported on the target.
if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
assert(!V.isUndef() && "Splat of undef should have been handled earlier");
return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
}
// Check if we can express BUILD VECTOR via subvector extract.
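// Illustrative example (hypothetical types): a v4i32 result built as
// (build_vector (extract_vector_elt v8i32 X, 4), ...,
// (extract_vector_elt v8i32 X, 7))
// becomes (v4i32 extract_subvector v8i32 X, 4); when the elements start at
// index 0 and X already has the result type, X is returned directly.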
if (!LegalTypes && (N->getNumOperands() > 1)) {
SDValue Op0 = N->getOperand(0);
auto checkElem = [&](SDValue Op) -> uint64_t {
if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
(Op0.getOperand(0) == Op.getOperand(0)))
if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
return CNode->getZExtValue();
return -1;
};
int Offset = checkElem(Op0);
for (unsigned i = 0; i < N->getNumOperands(); ++i) {
if (Offset + i != checkElem(N->getOperand(i))) {
Offset = -1;
break;
}
}
if ((Offset == 0) &&
(Op0.getOperand(0).getValueType() == N->getValueType(0)))
return Op0.getOperand(0);
if ((Offset != -1) &&
((Offset % N->getValueType(0).getVectorNumElements()) ==
0)) // IDX must be multiple of output size.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
Op0.getOperand(0), Op0.getOperand(1));
}
if (SDValue V = convertBuildVecZextToZext(N))
return V;
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
if (SDValue V = reduceBuildVecTruncToBitCast(N))
return V;
if (SDValue V = reduceBuildVecToShuffle(N))
return V;
return SDValue();
}
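// Illustrative example (hypothetical types, assuming v2i32 is not a legal
// type on the target): combineConcatVectorOfScalars below turns
// (v4i32 concat_vectors (v2i32 bitcast i64 A), (v2i32 bitcast i64 B))
// into
// (v4i32 bitcast (v2i64 build_vector A, B)),
// switching to a floating-point scalar type when any operand is an FP
// bitcast.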
static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT OpVT = N->getOperand(0).getValueType();
// If the operands are legal vectors, leave them alone.
if (TLI.isTypeLegal(OpVT))
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
SmallVector<SDValue, 8> Ops;
EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
// Keep track of what we encounter.
bool AnyInteger = false;
bool AnyFP = false;
for (const SDValue &Op : N->ops()) {
if (ISD::BITCAST == Op.getOpcode() &&
!Op.getOperand(0).getValueType().isVector())
Ops.push_back(Op.getOperand(0));
else if (ISD::UNDEF == Op.getOpcode())
Ops.push_back(ScalarUndef);
else
return SDValue();
// Note whether we encounter an integer or floating point scalar.
// If it's neither, bail out, it could be something weird like x86mmx.
EVT LastOpVT = Ops.back().getValueType();
if (LastOpVT.isFloatingPoint())
AnyFP = true;
else if (LastOpVT.isInteger())
AnyInteger = true;
else
return SDValue();
}
// If any of the operands is a floating point scalar bitcast to a vector,
// use floating point types throughout, and bitcast everything.
// Replace UNDEFs by another scalar UNDEF node, of the final desired type.
if (AnyFP) {
SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
if (AnyInteger) {
for (SDValue &Op : Ops) {
if (Op.getValueType() == SVT)
continue;
if (Op.isUndef())
Op = ScalarUndef;
else
Op = DAG.getBitcast(SVT, Op);
}
}
}
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
VT.getSizeInBits() / SVT.getSizeInBits());
return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
}
// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
// most two distinct vectors the same size as the result, attempt to turn this
// into a legal shuffle.
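// Illustrative example (hypothetical types): with v8i32 inputs A and B,
// (v8i32 concat_vectors (v4i32 extract_subvector A, 0),
// (v4i32 extract_subvector B, 4))
// can become vector_shuffle<0,1,2,3,12,13,14,15> A, B, provided the target
// accepts the mask.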
static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
EVT OpVT = N->getOperand(0).getValueType();
+
+ // We currently can't generate an appropriate shuffle for a scalable vector.
+ if (VT.isScalableVector())
+ return SDValue();
+
int NumElts = VT.getVectorNumElements();
int NumOpElts = OpVT.getVectorNumElements();
SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
SmallVector<int, 8> Mask;
for (SDValue Op : N->ops()) {
Op = peekThroughBitcasts(Op);
// UNDEF nodes convert to UNDEF shuffle mask values.
if (Op.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
continue;
}
if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
// What vector are we extracting the subvector from and at what index?
SDValue ExtVec = Op.getOperand(0);
int ExtIdx = Op.getConstantOperandVal(1);
// We want the EVT of the original extraction to correctly scale the
// extraction index.
EVT ExtVT = ExtVec.getValueType();
ExtVec = peekThroughBitcasts(ExtVec);
// UNDEF nodes convert to UNDEF shuffle mask values.
if (ExtVec.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
continue;
}
// Ensure that we are extracting a subvector from a vector the same
// size as the result.
if (ExtVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// Scale the subvector index to account for any bitcast.
int NumExtElts = ExtVT.getVectorNumElements();
if (0 == (NumExtElts % NumElts))
ExtIdx /= (NumExtElts / NumElts);
else if (0 == (NumElts % NumExtElts))
ExtIdx *= (NumElts / NumExtElts);
else
return SDValue();
// At most we can reference 2 inputs in the final shuffle.
if (SV0.isUndef() || SV0 == ExtVec) {
SV0 = ExtVec;
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx);
} else if (SV1.isUndef() || SV1 == ExtVec) {
SV1 = ExtVec;
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx + NumElts);
} else {
return SDValue();
}
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
DAG.getBitcast(VT, SV1), Mask, DAG);
}
static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
unsigned CastOpcode = N->getOperand(0).getOpcode();
switch (CastOpcode) {
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// TODO: Allow more opcodes?
// case ISD::BITCAST:
// case ISD::TRUNCATE:
// case ISD::ZERO_EXTEND:
// case ISD::SIGN_EXTEND:
// case ISD::FP_EXTEND:
break;
default:
return SDValue();
}
EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
if (!SrcVT.isVector())
return SDValue();
// All operands of the concat must be the same kind of cast from the same
// source type.
SmallVector<SDValue, 4> SrcOps;
for (SDValue Op : N->ops()) {
if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
Op.getOperand(0).getValueType() != SrcVT)
return SDValue();
SrcOps.push_back(Op.getOperand(0));
}
// The wider cast must be supported by the target. This is unusual because
// the operation support type parameter depends on the opcode. In addition,
// check the other type in the cast to make sure this is really legal.
EVT VT = N->getValueType(0);
EVT SrcEltVT = SrcVT.getVectorElementType();
unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands();
EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (CastOpcode) {
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
!TLI.isTypeLegal(VT))
return SDValue();
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
!TLI.isTypeLegal(ConcatSrcVT))
return SDValue();
break;
default:
llvm_unreachable("Unexpected cast opcode");
}
// concat (cast X), (cast Y)... -> cast (concat X, Y...)
SDLoc DL(N);
SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
return DAG.getNode(CastOpcode, DL, VT, NewConcat);
}
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// If we only have one input vector, we don't need to do any concatenation.
if (N->getNumOperands() == 1)
return N->getOperand(0);
// Check if all of the operands are undefs.
EVT VT = N->getValueType(0);
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
// Optimize concat_vectors where all but the first of the vectors are undef.
if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
return Op.isUndef();
})) {
SDValue In = N->getOperand(0);
assert(In.getValueType().isVector() && "Must concat vectors");
// If the input is a concat_vectors, just make a larger concat by padding
// with smaller undefs.
if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
unsigned NumOps = N->getNumOperands() * In.getNumOperands();
SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
SDValue Scalar = peekThroughOneUseBitcasts(In);
// concat_vectors(scalar_to_vector(scalar), undef) ->
// scalar_to_vector(scalar)
if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
Scalar.hasOneUse()) {
EVT SVT = Scalar.getValueType().getVectorElementType();
if (SVT == Scalar.getOperand(0).getValueType())
Scalar = Scalar.getOperand(0);
}
// concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
if (!Scalar.getValueType().isVector()) {
// If the bitcast type isn't legal, it might be a trunc of a legal type;
// look through the trunc so we can still do the transform:
// concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
if (Scalar->getOpcode() == ISD::TRUNCATE &&
!TLI.isTypeLegal(Scalar.getValueType()) &&
TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
Scalar = Scalar->getOperand(0);
EVT SclTy = Scalar.getValueType();
if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
return SDValue();
// Bail out if the vector size is not a multiple of the scalar size.
if (VT.getSizeInBits() % SclTy.getSizeInBits())
return SDValue();
unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
if (VNTNumElms < 2)
return SDValue();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
return SDValue();
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
return DAG.getBitcast(VT, Res);
}
}
// Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
// We have already tested above for an UNDEF only concatenation.
// fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
// -> (BUILD_VECTOR A, B, ..., C, D, ...)
auto IsBuildVectorOrUndef = [](const SDValue &Op) {
return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
};
if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
SmallVector<SDValue, 8> Opnds;
EVT SVT = VT.getScalarType();
EVT MinVT = SVT;
if (!SVT.isFloatingPoint()) {
// If the BUILD_VECTORs are built from integers, they may have different
// operand types. Get the smallest type and truncate all operands to it.
bool FoundMinVT = false;
for (const SDValue &Op : N->ops())
if (ISD::BUILD_VECTOR == Op.getOpcode()) {
EVT OpSVT = Op.getOperand(0).getValueType();
MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
FoundMinVT = true;
}
assert(FoundMinVT && "Concat vector type mismatch");
}
for (const SDValue &Op : N->ops()) {
EVT OpVT = Op.getValueType();
unsigned NumElts = OpVT.getVectorNumElements();
if (ISD::UNDEF == Op.getOpcode())
Opnds.append(NumElts, DAG.getUNDEF(MinVT));
if (ISD::BUILD_VECTOR == Op.getOpcode()) {
if (SVT.isFloatingPoint()) {
assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
} else {
for (unsigned i = 0; i != NumElts; ++i)
Opnds.push_back(
DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
}
}
}
assert(VT.getVectorNumElements() == Opnds.size() &&
"Concat vector type mismatch");
return DAG.getBuildVector(VT, SDLoc(N), Opnds);
}
// Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
if (SDValue V = combineConcatVectorOfScalars(N, DAG))
return V;
// Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
return V;
if (SDValue V = combineConcatVectorOfCasts(N, DAG))
return V;
// Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
- // nodes often generate nop CONCAT_VECTOR nodes.
- // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
- // place the incoming vectors at the exact same location.
+ // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
+ // operands and look for CONCAT operations that place the incoming vectors
+ // at the exact same location.
+ //
+ // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
SDValue SingleSource = SDValue();
- unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();
+ unsigned PartNumElem =
+ N->getOperand(0).getValueType().getVectorMinNumElements();
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDValue Op = N->getOperand(i);
if (Op.isUndef())
continue;
// Check if this is the identity extract:
if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
// Find the single incoming vector for the extract_subvector.
if (SingleSource.getNode()) {
if (Op.getOperand(0) != SingleSource)
return SDValue();
} else {
SingleSource = Op.getOperand(0);
// Check the source type is the same as the type of the result.
// If not, this concat may extend the vector, so we can not
// optimize it away.
if (SingleSource.getValueType() != N->getValueType(0))
return SDValue();
}
// Check that we are reading from the identity index.
unsigned IdentityIndex = i * PartNumElem;
if (Op.getConstantOperandAPInt(1) != IdentityIndex)
return SDValue();
}
if (SingleSource.getNode())
return SingleSource;
return SDValue();
}
// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
// if the subvector can be sourced for free.
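// Illustrative example (hypothetical values): with SubVT = v4i32 and
// Index = 4,
// (insert_subvector ?, X, 4) yields X, and
// (concat_vectors (v4i32 A), (v4i32 B)) yields B.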
static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
return V.getOperand(1);
}
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
V.getOperand(0).getValueType() == SubVT &&
(IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) {
uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements();
return V.getOperand(SubIdx);
}
return SDValue();
}
static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue BinOp = Extract->getOperand(0);
unsigned BinOpcode = BinOp.getOpcode();
if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
EVT VecVT = BinOp.getValueType();
SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
return SDValue();
SDValue Index = Extract->getOperand(1);
EVT SubVT = Extract->getValueType(0);
if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT))
return SDValue();
SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);
// TODO: We could handle the case where only 1 operand is being inserted by
// creating an extract of the other operand, but that requires checking
// number of uses and/or costs.
if (!Sub0 || !Sub1)
return SDValue();
// We are inserting both operands of the wide binop only to extract back
// to the narrow vector size. Eliminate all of the insert/extract:
// ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
BinOp->getFlags());
}
/// If we are extracting a subvector produced by a wide binary operator try
/// to use a narrow binary operator and/or avoid concatenation and extraction.
static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
// some of these bailouts with other transforms.
if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG))
return V;
// The extract index must be a constant, so we can map it to a concat operand.
auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
if (!ExtractIndexC)
return SDValue();
// We are looking for an optionally bitcasted wide vector binary operator
// feeding an extract subvector.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
unsigned BOpcode = BinOp.getOpcode();
if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
// Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
// reduced to the unary fneg when it is visited, and we probably want to deal
// with fneg in a target-specific way.
if (BOpcode == ISD::FSUB) {
auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
if (C && C->getValueAPF().isNegZero())
return SDValue();
}
// The binop must be a vector type, so we can extract some fraction of it.
EVT WideBVT = BinOp.getValueType();
- if (!WideBVT.isVector())
+ // The optimisations below currently assume we are dealing with fixed length
+ // vectors. It is possible to add support for scalable vectors, but at the
+ // moment we've done no analysis to prove whether they are profitable or not.
+ if (!WideBVT.isFixedLengthVector())
return SDValue();
EVT VT = Extract->getValueType(0);
unsigned ExtractIndex = ExtractIndexC->getZExtValue();
assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
"Extract index is not a multiple of the vector length.");
// Bail out if this is not a proper multiple width extraction.
unsigned WideWidth = WideBVT.getSizeInBits();
unsigned NarrowWidth = VT.getSizeInBits();
if (WideWidth % NarrowWidth != 0)
return SDValue();
// Bail out if we are extracting a fraction of a single operation. This can
// occur because we potentially looked through a bitcast of the binop.
unsigned NarrowingRatio = WideWidth / NarrowWidth;
unsigned WideNumElts = WideBVT.getVectorNumElements();
if (WideNumElts % NarrowingRatio != 0)
return SDValue();
// Bail out if the target does not support a narrower version of the binop.
EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
WideNumElts / NarrowingRatio);
if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
return SDValue();
// If extraction is cheap, we don't need to look at the binop operands
// for concat ops. The narrow binop alone makes this transform profitable.
// We can't just reuse the original extract index operand because we may have
// bitcasted.
unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
// extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
SDLoc DL(Extract);
SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(0), NewExtIndex);
SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(1), NewExtIndex);
SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
BinOp.getNode()->getFlags());
return DAG.getBitcast(VT, NarrowBinOp);
}
// Only handle the case where we are doubling and then halving. A larger ratio
// may require more than two narrow binops to replace the wide binop.
if (NarrowingRatio != 2)
return SDValue();
// TODO: The motivating case for this transform is an x86 AVX1 target. That
// target has temptingly almost legal versions of bitwise logic ops in 256-bit
// flavors, but no other 256-bit integer support. This could be extended to
// handle any binop, but that may require fixing/adding other folds to avoid
// codegen regressions.
if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
return SDValue();
// We need at least one concatenation operation of a binop operand to make
// this transform worthwhile. The concat must double the input vector sizes.
auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
return V.getOperand(ConcatOpNum);
return SDValue();
};
SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));
if (SubVecL || SubVecR) {
// If a binop operand was not the result of a concat, we must extract a
// half-sized operand for our new narrow binop:
// extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
// extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
// extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
SDLoc DL(Extract);
SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
: DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(0), IndexC);
SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
: DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(1), IndexC);
SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
return DAG.getBitcast(VT, NarrowBinOp);
}
return SDValue();
}
/// If we are extracting a subvector from a wide vector load, convert to a
/// narrow load to eliminate the extraction:
/// (extract_subvector (load wide vector)) --> (load narrow vector)
static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
// TODO: Add support for big-endian. The offset calculation must be adjusted.
if (DAG.getDataLayout().isBigEndian())
return SDValue();
auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
if (!Ld || Ld->getExtensionType() || !Ld->isSimple() ||
!ExtIdx)
return SDValue();
// Allow targets to opt-out.
EVT VT = Extract->getValueType(0);
// We can only create byte sized loads.
if (!VT.isByteSized())
return SDValue();
unsigned Index = ExtIdx->getZExtValue();
unsigned NumElts = VT.getVectorNumElements();
// If the index is a multiple of the extract element count, we can offset the
// address by the store size multiplied by the subvector index. Otherwise if
// the scalar type is byte sized, we can just use the index multiplied by
// the element size in bytes as the offset.
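// Illustrative example (hypothetical types): extracting a v2i64 subvector at
// index 2 from a v4i64 load gives Offset = (2 / 2) * 16 = 16 bytes, while
// extracting v2i32 at index 1 from a v8i32 load falls back to
// Offset = 1 * 4 = 4 bytes.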
unsigned Offset;
if (Index % NumElts == 0)
Offset = (Index / NumElts) * VT.getStoreSize();
else if (VT.getScalarType().isByteSized())
Offset = Index * VT.getScalarType().getStoreSize();
else
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
return SDValue();
// The narrow load will be offset from the base address of the old load if
// we are extracting from something besides index 0 (little-endian).
SDLoc DL(Extract);
SDValue BaseAddr = Ld->getBasePtr();
// TODO: Use "BaseIndexOffset" to make this more effective.
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
VT.getStoreSize());
SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return NewLd;
}
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
uint64_t ExtIdx = N->getConstantOperandVal(1);
// Extract from UNDEF is UNDEF.
if (V.isUndef())
return DAG.getUNDEF(NVT);
if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
return NarrowLoad;
// Combine an extract of an extract into a single extract_subvector.
// ext (ext X, C), 0 --> ext X, C
if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
V.getConstantOperandVal(1)) &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
V.getOperand(1));
}
}
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
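// Illustrative example (hypothetical types): extracting v2i64 at index 2 from
// (v4i64 bitcast (v8i32 X)) can be rewritten as
// (v2i64 bitcast (v4i32 extract_subvector v8i32 X, 4)),
// scaling the index by the ratio of the element counts.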
if (V.getOpcode() == ISD::BITCAST &&
V.getOperand(0).getValueType().isVector()) {
SDValue SrcOp = V.getOperand(0);
EVT SrcVT = SrcOp.getValueType();
unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
if ((SrcNumElts % DestNumElts) == 0) {
unsigned SrcDestRatio = SrcNumElts / DestNumElts;
ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
NewExtEC);
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
SDLoc DL(N);
SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
}
}
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) {
ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio;
EVT ScalarVT = SrcVT.getScalarType();
if ((ExtIdx % DestSrcRatio) == 0) {
SDLoc DL(N);
unsigned IndexValScaled = ExtIdx / DestSrcRatio;
EVT NewExtVT =
EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
SDValue NewExtract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
}
if (NewExtEC == 1 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
SDValue NewExtract =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
}
}
}
}
}
if (V.getOpcode() == ISD::CONCAT_VECTORS) {
unsigned ExtNumElts = NVT.getVectorMinNumElements();
EVT ConcatSrcVT = V.getOperand(0).getValueType();
assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
"Concat and extract subvector do not change element type");
assert((ExtIdx % ExtNumElts) == 0 &&
"Extract index is not a multiple of the input vector length.");
unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
// If the concatenated source types match this extract, it's a direct
// simplification:
// extract_subvec (concat V1, V2, ...), i --> Vi
if (ConcatSrcNumElts == ExtNumElts)
return V.getOperand(ConcatOpIdx);
// If the concatenated source vectors are a multiple length of this extract,
// then extract a fraction of one of those source vectors directly from a
// concat operand. Example:
// v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
// v2i8 extract_subvec v8i8 Y, 6
if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {
SDLoc DL(N);
unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
"Trying to extract from >1 concat operand?");
assert(NewExtIdx % ExtNumElts == 0 &&
"Extract index is not a multiple of the input vector length.");
SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
V.getOperand(ConcatOpIdx), NewIndexC);
}
}
V = peekThroughBitcasts(V);
// If the input is a build vector, try to make a smaller build vector.
if (V.getOpcode() == ISD::BUILD_VECTOR) {
EVT InVT = V.getValueType();
unsigned ExtractSize = NVT.getSizeInBits();
unsigned EltSize = InVT.getScalarSizeInBits();
// Only do this if we won't split any elements.
if (ExtractSize % EltSize == 0) {
unsigned NumElems = ExtractSize / EltSize;
EVT EltVT = InVT.getVectorElementType();
EVT ExtractVT =
NumElems == 1 ? EltVT
: EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
if ((Level < AfterLegalizeDAG ||
(NumElems == 1 ||
TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
(!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
if (NumElems == 1) {
SDValue Src = V->getOperand(IdxVal);
if (EltVT != Src.getValueType())
Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
return DAG.getBitcast(NVT, Src);
}
// Extract the pieces from the original build_vector.
SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
V->ops().slice(IdxVal, NumElems));
return DAG.getBitcast(NVT, BuildVec);
}
}
}
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
// Handle only the simple case where the vector being inserted and the
// vector being extracted are of the same size.
EVT SmallVT = V.getOperand(1).getValueType();
if (!NVT.bitsEq(SmallVT))
return SDValue();
// Combine:
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
// Into:
// indices are equal or bit offsets are equal => V1
// otherwise => (extract_subvec V1, ExtIdx)
uint64_t InsIdx = V.getConstantOperandVal(2);
if (InsIdx * SmallVT.getScalarSizeInBits() ==
ExtIdx * NVT.getScalarSizeInBits())
return DAG.getBitcast(NVT, V.getOperand(1));
return DAG.getNode(
ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
N->getOperand(1));
}
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
return NarrowBOp;
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
}
/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
/// followed by concatenation. Narrow vector ops may have better performance
/// than wide ops, and this can unlock further narrowing of other vector ops.
/// Targets can invert this transform later if it is not profitable.
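/// Illustrative example (hypothetical types): with v4i32 inputs X and Y,
/// shuffle (v8i32 concat X, undef), (v8i32 concat Y, undef),
/// <0,8,1,9,2,10,3,11>
/// becomes
/// concat (shuffle X, Y, <0,4,1,5>), (shuffle X, Y, <2,6,3,7>)
/// when the target considers both half-width masks legal.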
static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
!N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
return SDValue();
// Split the wide shuffle mask into halves. Any mask element that is accessing
// operand 1 is offset down to account for narrowing of the vectors.
ArrayRef<int> Mask = Shuf->getMask();
EVT VT = Shuf->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> Mask0(HalfNumElts, -1);
SmallVector<int, 16> Mask1(HalfNumElts, -1);
for (unsigned i = 0; i != NumElts; ++i) {
if (Mask[i] == -1)
continue;
int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
if (i < HalfNumElts)
Mask0[i] = M;
else
Mask1[i - HalfNumElts] = M;
}
// Ask the target if this is a valid transform.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
HalfNumElts);
if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
!TLI.isShuffleMaskLegal(Mask1, HalfVT))
return SDValue();
// shuffle (concat X, undef), (concat Y, undef), Mask -->
// concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
SDLoc DL(Shuf);
SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
}
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
ArrayRef<int> Mask = SVN->getMask();
SmallVector<SDValue, 4> Ops;
EVT ConcatVT = N0.getOperand(0).getValueType();
unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
unsigned NumConcats = NumElts / NumElemsPerConcat;
auto IsUndefMaskElt = [](int i) { return i == -1; };
// Special case: shuffle(concat(A,B)) can be more efficiently represented
// as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
// half vector elements.
if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
IsUndefMaskElt)) {
N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
N0.getOperand(1),
Mask.slice(0, NumElemsPerConcat));
N1 = DAG.getUNDEF(ConcatVT);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
}
// Look at every vector that's inserted. We're looking for exact
// subvector-sized copies from a concatenated vector
for (unsigned I = 0; I != NumConcats; ++I) {
unsigned Begin = I * NumElemsPerConcat;
ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);
// Make sure we're dealing with a copy.
if (llvm::all_of(SubMask, IsUndefMaskElt)) {
Ops.push_back(DAG.getUNDEF(ConcatVT));
continue;
}
int OpIdx = -1;
for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
if (IsUndefMaskElt(SubMask[i]))
continue;
if ((SubMask[i] % (int)NumElemsPerConcat) != i)
return SDValue();
int EltOpIdx = SubMask[i] / NumElemsPerConcat;
if (0 <= OpIdx && EltOpIdx != OpIdx)
return SDValue();
OpIdx = EltOpIdx;
}
assert(0 <= OpIdx && "Unknown concat_vectors op");
if (OpIdx < (int)N0.getNumOperands())
Ops.push_back(N0.getOperand(OpIdx));
else
Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
}
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
//
// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
// a simplification in some sense, but it isn't appropriate in general: some
// BUILD_VECTORs are substantially cheaper than others. The general case
// of a BUILD_VECTOR requires inserting each element individually (or
// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
// all constants is a single constant pool load. A BUILD_VECTOR where each
// element is identical is a splat. A BUILD_VECTOR where most of the operands
// are undef lowers to a small number of element insertions.
//
// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
// We don't fold shuffles where one side is a non-zero constant, and we don't
// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
// non-constant operands. This seems to work out reasonably well in practice.
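// Illustrative example (hypothetical operands, none duplicated or constant):
// shuffle (build_vector a, b, c, d), (build_vector e, f, g, h), <0,5,2,7>
// becomes (build_vector a, f, c, h).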
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI) {
EVT VT = SVN->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = SVN->getOperand(0);
SDValue N1 = SVN->getOperand(1);
if (!N0->hasOneUse())
return SDValue();
// If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
// discussed above.
if (!N1.isUndef()) {
if (!N1->hasOneUse())
return SDValue();
bool N0AnyConst = isAnyConstantBuildVector(N0);
bool N1AnyConst = isAnyConstantBuildVector(N1);
if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
return SDValue();
if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
return SDValue();
}
// If both inputs are splats of the same value then we can safely merge this
// to a single BUILD_VECTOR with undef elements based on the shuffle mask.
bool IsSplat = false;
auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
if (BV0 && BV1)
if (SDValue Splat0 = BV0->getSplatValue())
IsSplat = (Splat0 == BV1->getSplatValue());
SmallVector<SDValue, 8> Ops;
SmallSet<SDValue, 16> DuplicateOps;
for (int M : SVN->getMask()) {
SDValue Op = DAG.getUNDEF(VT.getScalarType());
if (M >= 0) {
int Idx = M < (int)NumElts ? M : M - NumElts;
SDValue &S = (M < (int)NumElts ? N0 : N1);
if (S.getOpcode() == ISD::BUILD_VECTOR) {
Op = S.getOperand(Idx);
} else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
SDValue Op0 = S.getOperand(0);
Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
} else {
// Operand can't be combined - bail out.
return SDValue();
}
}
// Don't duplicate a non-constant BUILD_VECTOR operand unless we're
// generating a splat; semantically, this is fine, but it's likely to
// generate low-quality code if the target can't reconstruct an appropriate
// shuffle.
if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
if (!IsSplat && !DuplicateOps.insert(Op).second)
return SDValue();
Ops.push_back(Op);
}
// BUILD_VECTOR requires all inputs to be of the same type, find the
// maximum type and extend them all.
EVT SVT = VT.getScalarType();
if (SVT.isInteger())
for (SDValue &Op : Ops)
SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
if (SVT != VT.getScalarType())
for (SDValue &Op : Ops)
Op = TLI.isZExtFree(Op.getValueType(), SVT)
? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
: DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}
// Match shuffles that can be converted to any_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI,
bool LegalOperations) {
EVT VT = SVN->getValueType(0);
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
// TODO Add support for big-endian when we have a test case.
if (!VT.isInteger() || IsBigEndian)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
ArrayRef<int> Mask = SVN->getMask();
SDValue N0 = SVN->getOperand(0);
// shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
for (unsigned i = 0; i != NumElts; ++i) {
if (Mask[i] < 0)
continue;
if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
continue;
return false;
}
return true;
};
// Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
// power-of-2 extensions as they are the most likely.
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
// Check for non power of 2 vector sizes
if (NumElts % Scale != 0)
continue;
if (!isAnyExtend(Scale))
continue;
EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
// Never create an illegal type. Only create unsupported operations if we
// are pre-legalization.
if (TLI.isTypeLegal(OutVT))
if (!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
return DAG.getBitcast(VT,
DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
SDLoc(SVN), OutVT, N0));
}
return SDValue();
}
// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
// If the source node itself was a '*_extend_vector_inreg' node then we should
// be able to remove it.
static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) {
EVT VT = SVN->getValueType(0);
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
// TODO Add support for big-endian when we have a test case.
if (!VT.isInteger() || IsBigEndian)
return SDValue();
SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));
unsigned Opcode = N0.getOpcode();
if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
return SDValue();
SDValue N00 = N0.getOperand(0);
ArrayRef<int> Mask = SVN->getMask();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();
if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
return SDValue();
unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
// (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
// (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
// (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
for (unsigned i = 0; i != NumElts; ++i) {
if (Mask[i] < 0)
continue;
if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
continue;
return false;
}
return true;
};
// At the moment we just handle the case where we've truncated back to the
// same size as before the extension.
// TODO: handle more extension/truncation cases as cases arise.
if (EltSizeInBits != ExtSrcSizeInBits)
return SDValue();
// We can remove *extend_vector_inreg only if the truncation happens at
// the same scale as the extension.
if (isTruncate(ExtScale))
return DAG.getBitcast(VT, N00);
return SDValue();
}
// Combine shuffles of splat-shuffles of the form:
// shuffle (shuffle V, undef, splat-mask), undef, M
// If splat-mask contains undef elements, we need to be careful about
// introducing undef's in the folded mask which are not the result of composing
// the masks of the shuffles.
static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
if (!Shuf->getOperand(1).isUndef())
return SDValue();
auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
if (!Splat || !Splat->isSplat())
return SDValue();
ArrayRef<int> ShufMask = Shuf->getMask();
ArrayRef<int> SplatMask = Splat->getMask();
assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");
// Prefer simplifying to the splat-shuffle, if possible. This is legal if
// every undef mask element in the splat-shuffle has a corresponding undef
// element in the user-shuffle's mask or if the composition of mask elements
// would result in undef.
// Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
// * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
// In this case it is not legal to simplify to the splat-shuffle because we
// may be exposing to the users of the shuffle an undef element at index 1
// which was not there before the combine.
// * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
// In this case the composition of masks yields SplatMask, so it's ok to
// simplify to the splat-shuffle.
// * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
// In this case the composed mask includes all undef elements of SplatMask
// and in addition sets element zero to undef. It is safe to simplify to
// the splat-shuffle.
auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
ArrayRef<int> SplatMask) {
for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
if (UserMask[i] != -1 && SplatMask[i] == -1 &&
SplatMask[UserMask[i]] != -1)
return false;
return true;
};
if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
return Shuf->getOperand(0);
// Create a new shuffle with a mask that is composed of the two shuffles'
// masks.
SmallVector<int, 32> NewMask;
for (int Idx : ShufMask)
NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
Splat->getOperand(0), Splat->getOperand(1),
NewMask);
}
/// Combine shuffle of shuffle of the form:
/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
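/// Illustrative example (hypothetical masks): with InnerMask = <u,2,2,u> and
/// OuterMask = <1,1,2,u>, the composed mask is <2,2,2,u>, i.e. a splat of
/// element 2 of X, which is emitted when the target accepts the mask.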
static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
SelectionDAG &DAG) {
if (!OuterShuf->getOperand(1).isUndef())
return SDValue();
auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
return SDValue();
ArrayRef<int> OuterMask = OuterShuf->getMask();
ArrayRef<int> InnerMask = InnerShuf->getMask();
unsigned NumElts = OuterMask.size();
assert(NumElts == InnerMask.size() && "Mask length mismatch");
SmallVector<int, 32> CombinedMask(NumElts, -1);
int SplatIndex = -1;
for (unsigned i = 0; i != NumElts; ++i) {
// Undef lanes remain undef.
int OuterMaskElt = OuterMask[i];
if (OuterMaskElt == -1)
continue;
// Peek through the shuffle masks to get the underlying source element.
int InnerMaskElt = InnerMask[OuterMaskElt];
if (InnerMaskElt == -1)
continue;
// Initialize the splatted element.
if (SplatIndex == -1)
SplatIndex = InnerMaskElt;
// Non-matching index - this is not a splat.
if (SplatIndex != InnerMaskElt)
return SDValue();
CombinedMask[i] = InnerMaskElt;
}
assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
getSplatIndex(CombinedMask) != -1) &&
"Expected a splat mask");
// TODO: The transform may be a win even if the mask is not legal.
EVT VT = OuterShuf->getValueType(0);
assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
return SDValue();
return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
InnerShuf->getOperand(1), CombinedMask);
}
/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
/// from the first operand. Otherwise, return -1.
static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
int MaskSize = Mask.size();
int EltFromOp0 = -1;
// TODO: This does not match if there are undef elements in the shuffle mask.
// Should we ignore undefs in the shuffle mask instead? The trade-off is
// removing an instruction (a shuffle), but losing the knowledge that some
// vector lanes are not needed.
for (int i = 0; i != MaskSize; ++i) {
if (Mask[i] >= 0 && Mask[i] < MaskSize) {
// We're looking for a shuffle of exactly one element from operand 0.
if (EltFromOp0 != -1)
return -1;
EltFromOp0 = i;
} else if (Mask[i] != i + MaskSize) {
// Nothing from operand 1 can change lanes.
return -1;
}
}
return EltFromOp0;
}
/// If a shuffle inserts exactly one element from a source vector operand into
/// another vector operand and we can access the specified element as a scalar,
/// then we can eliminate the shuffle.
static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
// First, check if we are taking one element of a vector and shuffling that
// element into another vector.
ArrayRef<int> Mask = Shuf->getMask();
SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
SDValue Op0 = Shuf->getOperand(0);
SDValue Op1 = Shuf->getOperand(1);
int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
if (ShufOp0Index == -1) {
// Commute mask and check again.
ShuffleVectorSDNode::commuteMask(CommutedMask);
ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
if (ShufOp0Index == -1)
return SDValue();
// Commute operands to match the commuted shuffle mask.
std::swap(Op0, Op1);
Mask = CommutedMask;
}
// The shuffle inserts exactly one element from operand 0 into operand 1.
// Now see if we can access that element as a scalar via a real insert element
// instruction.
// TODO: We can try harder to locate the element as a scalar. Examples: it
// could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
"Shuffle mask value must be from operand 0");
if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
return SDValue();
// There's an existing insertelement with constant insertion index, so we
// don't need to check the legality/profitability of a replacement operation
// that differs at most in the constant value. The target should be able to
// lower any of those in a similar way. If not, legalization will expand this
// to a scalar-to-vector plus shuffle.
//
// Note that the shuffle may move the scalar from the position that the insert
// element used. Therefore, our new insert element occurs at the shuffle's
// mask index value, not the insert's index value.
// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
Op1, Op0.getOperand(1), NewInsIndex);
}
/// If we have a unary shuffle of a shuffle, see if it can be folded away
/// completely. This has the potential to lose undef knowledge because the first
/// shuffle may not have an undef mask element where the second one does. So
/// only call this after doing simplifications based on demanded elements.
static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
// shuf (shuf0 X, Y, Mask0), undef, Mask
auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
if (!Shuf0 || !Shuf->getOperand(1).isUndef())
return SDValue();
ArrayRef<int> Mask = Shuf->getMask();
ArrayRef<int> Mask0 = Shuf0->getMask();
for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
// Ignore undef elements.
if (Mask[i] == -1)
continue;
assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");
// Is the element of the shuffle operand chosen by this shuffle the same as
// the element chosen by the shuffle operand itself?
if (Mask0[Mask[i]] != Mask0[i])
return SDValue();
}
// Every element of this shuffle is identical to the result of the previous
// shuffle, so we can replace this value.
return Shuf->getOperand(0);
}
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
// Canonicalize shuffle undef, undef -> undef
if (N0.isUndef() && N1.isUndef())
return DAG.getUNDEF(VT);
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
// Canonicalize shuffle v, v -> v, undef
if (N0 == N1) {
SmallVector<int, 8> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
if (Idx >= (int)NumElts) Idx -= NumElts;
NewMask.push_back(Idx);
}
return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
}
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
if (N0.isUndef())
return DAG.getCommutedVectorShuffle(*SVN);
// Remove references to rhs if it is undef
if (N1.isUndef()) {
bool Changed = false;
SmallVector<int, 8> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
if (Idx >= (int)NumElts) {
Idx = -1;
Changed = true;
}
NewMask.push_back(Idx);
}
if (Changed)
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
return InsElt;
// A shuffle of a single vector that is a splatted value can always be folded.
if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
return V;
if (SDValue V = formSplatFromShuffles(SVN, DAG))
return V;
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
int SplatIndex = SVN->getSplatIndex();
if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) {
// splat (vector_bo L, R), Index -->
// splat (scalar_bo (extelt L, Index), (extelt R, Index))
SDValue L = N0.getOperand(0), R = N0.getOperand(1);
SDLoc DL(N);
EVT EltVT = VT.getScalarType();
SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
N0.getNode()->getFlags());
SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
}
// If this is a bit convert that changes the element type of the vector but
// not the number of vector elements, look through it. Be careful not to
// look through conversions that change things like v4f32 to v2f64.
SDNode *V = N0.getNode();
if (V->getOpcode() == ISD::BITCAST) {
SDValue ConvInput = V->getOperand(0);
if (ConvInput.getValueType().isVector() &&
ConvInput.getValueType().getVectorNumElements() == NumElts)
V = ConvInput.getNode();
}
if (V->getOpcode() == ISD::BUILD_VECTOR) {
assert(V->getNumOperands() == NumElts &&
"BUILD_VECTOR has wrong number of operands");
SDValue Base;
bool AllSame = true;
for (unsigned i = 0; i != NumElts; ++i) {
if (!V->getOperand(i).isUndef()) {
Base = V->getOperand(i);
break;
}
}
// Splat of <u, u, u, u>, return <u, u, u, u>
if (!Base.getNode())
return N0;
for (unsigned i = 0; i != NumElts; ++i) {
if (V->getOperand(i) != Base) {
AllSame = false;
break;
}
}
// Splat of <x, x, x, x>, return <x, x, x, x>
if (AllSame)
return N0;
// Canonicalize any other splat as a build_vector.
SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
// BUILD_VECTOR may not match the type of the shuffle.
if (V->getValueType(0) != VT)
NewBV = DAG.getBitcast(VT, NewBV);
return NewBV;
}
}
// Simplify source operands based on shuffle mask.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
// This is intentionally placed after demanded elements simplification because
// it could eliminate knowledge of undef elements created by this shuffle.
if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
return ShufOp;
// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
return V;
// Combine "truncate_vector_in_reg" style shuffles.
if (SDValue V = combineTruncationShuffle(SVN, DAG))
return V;
if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
Level < AfterLegalizeVectorOps &&
(N1.isUndef() ||
(N1.getOpcode() == ISD::CONCAT_VECTORS &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
if (SDValue V = partitionShuffleOfConcats(N, DAG))
return V;
}
// A shuffle of a concat of the same narrow vector can be reduced to use
// only low-half elements of a concat with undef:
// shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
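// Illustrative example (hypothetical types): with v4i32 X,
// shuffle (v8i32 concat X, X), undef, <4,0,5,1,6,2,7,3>
// can instead use mask <0,0,1,1,2,2,3,3> on (concat X, undef).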
if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
N0.getNumOperands() == 2 &&
N0.getOperand(0) == N0.getOperand(1)) {
int HalfNumElts = (int)NumElts / 2;
SmallVector<int, 8> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
if (Idx >= HalfNumElts) {
assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
Idx -= HalfNumElts;
}
NewMask.push_back(Idx);
}
if (TLI.isShuffleMaskLegal(NewMask, VT)) {
SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
N0.getOperand(0), UndefVec);
return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
}
}
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
return Res;
// If this shuffle only has a single input that is a bitcasted shuffle,
// attempt to merge the 2 shuffles and suitably bitcast the inputs/output
// back to their original types.
if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
N1.isUndef() && Level < AfterLegalizeVectorOps &&
TLI.isTypeLegal(VT)) {
SDValue BC0 = peekThroughOneUseBitcasts(N0);
if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
EVT SVT = VT.getScalarType();
EVT InnerVT = BC0->getValueType(0);
EVT InnerSVT = InnerVT.getScalarType();
// Determine which shuffle works with the smaller scalar type.
EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
EVT ScaleSVT = ScaleVT.getScalarType();
if (TLI.isTypeLegal(ScaleVT) &&
0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
// Scale the shuffle masks to the smaller scalar type.
ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
SmallVector<int, 8> InnerMask;
SmallVector<int, 8> OuterMask;
narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);
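// Illustrative example (hypothetical types): VT = v4i32 bitcast of
// InnerVT = v2i64 gives ScaleVT = v4i32, InnerScale = 2, OuterScale = 1.
// An inner mask <1,0> scales to <2,3,0,1>; with an outer mask of <3,2,1,0>
// the merged mask built below becomes <1,0,3,2>.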
// Merge the shuffle masks.
SmallVector<int, 8> NewMask;
for (int M : OuterMask)
NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
// Test for shuffle mask legality over both commutations.
SDValue SV0 = BC0->getOperand(0);
SDValue SV1 = BC0->getOperand(1);
bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
if (!LegalMask) {
std::swap(SV0, SV1);
ShuffleVectorSDNode::commuteMask(NewMask);
LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
}
if (LegalMask) {
SV0 = DAG.getBitcast(ScaleVT, SV0);
SV1 = DAG.getBitcast(ScaleVT, SV1);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
}
}
}
}
// Canonicalize shuffles according to rules:
// shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
// shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
// shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
TLI.isTypeLegal(VT)) {
// The incoming shuffle must be of the same type as the result of the
// current shuffle.
assert(N1->getOperand(0).getValueType() == VT &&
"Shuffle types don't match");
SDValue SV0 = N1->getOperand(0);
SDValue SV1 = N1->getOperand(1);
bool HasSameOp0 = N0 == SV0;
bool IsSV1Undef = SV1.isUndef();
if (HasSameOp0 || IsSV1Undef || N0 == SV1)
// Commute the operands of this shuffle so that next rule
// will trigger.
return DAG.getCommutedVectorShuffle(*SVN);
}
// Try to fold according to rules:
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
// Don't try to fold shuffles with illegal type.
// Only fold if this shuffle is the only user of the other shuffle.
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
// Don't try to fold splats; they're likely to simplify somehow, or they
// might be free.
if (OtherSV->isSplat())
return SDValue();
// The incoming shuffle must be of the same type as the result of the
// current shuffle.
assert(OtherSV->getOperand(0).getValueType() == VT &&
"Shuffle types don't match");
SDValue SV0, SV1;
SmallVector<int, 4> Mask;
// Compute the combined shuffle mask for a shuffle with SV0 as the first
// operand, and SV1 as the second operand.
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
if (Idx < 0) {
// Propagate Undef.
Mask.push_back(Idx);
continue;
}
SDValue CurrentVec;
if (Idx < (int)NumElts) {
// This shuffle index refers to the inner shuffle N0. Lookup the inner
// shuffle mask to identify which vector is actually referenced.
Idx = OtherSV->getMaskElt(Idx);
if (Idx < 0) {
// Propagate Undef.
Mask.push_back(Idx);
continue;
}
CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
: OtherSV->getOperand(1);
} else {
// This shuffle index references an element within N1.
CurrentVec = N1;
}
// Simple case where 'CurrentVec' is UNDEF.
if (CurrentVec.isUndef()) {
Mask.push_back(-1);
continue;
}
// Canonicalize the shuffle index. We don't know yet if CurrentVec
// will be the first or second operand of the combined shuffle.
Idx = Idx % NumElts;
if (!SV0.getNode() || SV0 == CurrentVec) {
// Ok. CurrentVec is the left hand side.
// Update the mask accordingly.
SV0 = CurrentVec;
Mask.push_back(Idx);
continue;
}
// Bail out if we cannot convert the shuffle pair into a single shuffle.
if (SV1.getNode() && SV1 != CurrentVec)
return SDValue();
// Ok. CurrentVec is the right hand side.
// Update the mask accordingly.
SV1 = CurrentVec;
Mask.push_back(Idx + NumElts);
}
// Check if all indices in Mask are Undef. If so, propagate Undef.
bool isUndefMask = true;
for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
isUndefMask &= Mask[i] < 0;
if (isUndefMask)
return DAG.getUNDEF(VT);
if (!SV0.getNode())
SV0 = DAG.getUNDEF(VT);
if (!SV1.getNode())
SV1 = DAG.getUNDEF(VT);
// Avoid introducing shuffles with illegal mask.
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG);
}
if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
return V;
return SDValue();
}
SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
SDValue InVal = N->getOperand(0);
EVT VT = N->getValueType(0);
// Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
// with a VECTOR_SHUFFLE and possible truncate.
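// Illustrative example (assuming matching types): scalar_to_vector
// (extract_vector_elt v4i32 V, 2) becomes vector_shuffle V, undef, <2,-1,-1,-1>,
// i.e. lane 0 of the result is element 2 of V and the other lanes are undef.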
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
VT.isFixedLengthVector() &&
InVal->getOperand(0).getValueType().isFixedLengthVector()) {
SDValue InVec = InVal->getOperand(0);
SDValue EltNo = InVal->getOperand(1);
auto InVecT = InVec.getValueType();
if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
int Elt = C0->getZExtValue();
NewMask[0] = Elt;
// If we have an implicit truncate, do the truncate here as long as it's
// legal; if it's not legal, we leave the node alone.
if (VT.getScalarType() != InVal.getValueType() &&
InVal.getValueType().isScalarInteger() &&
isTypeLegal(VT.getScalarType())) {
SDValue Val =
DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
}
if (VT.getScalarType() == InVecT.getScalarType() &&
VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
DAG.getUNDEF(InVecT), NewMask, DAG);
if (LegalShuffle) {
// If the initial vector is the correct size this shuffle is a
// valid result.
if (VT == InVecT)
return LegalShuffle;
// If not we must truncate the vector.
if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
InVecT.getVectorElementType(),
VT.getVectorNumElements());
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
LegalShuffle, ZeroIdx);
}
}
}
}
}
return SDValue();
}
SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
uint64_t InsIdx = N->getConstantOperandVal(2);
// If inserting an UNDEF, just return the original vector.
if (N1.isUndef())
return N0;
// If this is an insert of an extracted vector into an undef vector, we can
// just use the input to the extract.
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
return N1.getOperand(0);
// If we are inserting a bitcast value into an undef, with the same
// number of elements, just use the bitcast input of the extract.
// i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
// BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(0).getOperand(1) == N2 &&
N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
VT.getVectorNumElements() &&
N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
VT.getSizeInBits()) {
return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
}
// If both N0 and N1 are bitcast values on which insert_subvector
// would make sense, pull the bitcast through.
// i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
// BITCAST (INSERT_SUBVECTOR N0 N1 N2)
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
SDValue CN0 = N0.getOperand(0);
SDValue CN1 = N1.getOperand(0);
EVT CN0VT = CN0.getValueType();
EVT CN1VT = CN1.getValueType();
if (CN0VT.isVector() && CN1VT.isVector() &&
CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
CN0VT.getVectorNumElements() == VT.getVectorNumElements()) {
SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
CN0.getValueType(), CN0, CN1, N2);
return DAG.getBitcast(VT, NewINSERT);
}
}
// Combine INSERT_SUBVECTORs where we are inserting to the same index.
// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
N0.getOperand(1).getValueType() == N1.getValueType() &&
N0.getOperand(2) == N2)
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
N1, N2);
// Eliminate an intermediate insert into an undef vector:
// insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
// insert_subvector undef, X, N2
if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
N1.getOperand(1), N2);
// Push subvector bitcasts to the output, adjusting the index as we go.
// insert_subvector(bitcast(v), bitcast(s), c1)
// -> bitcast(insert_subvector(v, s, c2))
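// Illustrative example (hypothetical types): inserting (bitcast v4i32 S to
// v2i64) into a v4i64 at index 2 becomes an insert of S into the v8i32 view
// at index 4 (Scale = 64/32 = 2), followed by a bitcast back to v4i64.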
if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
N1.getOpcode() == ISD::BITCAST) {
SDValue N0Src = peekThroughBitcasts(N0);
SDValue N1Src = peekThroughBitcasts(N1);
EVT N0SrcSVT = N0Src.getValueType().getScalarType();
EVT N1SrcSVT = N1Src.getValueType().getScalarType();
if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
EVT NewVT;
SDLoc DL(N);
SDValue NewIdx;
LLVMContext &Ctx = *DAG.getContext();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
} else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
}
}
if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
SDValue Res = DAG.getBitcast(NewVT, N0Src);
Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
return DAG.getBitcast(VT, Res);
}
}
}
// Canonicalize insert_subvector dag nodes.
// Example (SubX/SubY are the inserted subvectors, with Idx0 < Idx1):
// (insert_subvector (insert_subvector A, SubX, Idx1), SubY, Idx0)
// -> (insert_subvector (insert_subvector A, SubY, Idx0), SubX, Idx1)
// so that nested inserts end up ordered by increasing index.
if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
N1.getValueType() == N0.getOperand(1).getValueType()) {
unsigned OtherIdx = N0.getConstantOperandVal(2);
if (InsIdx < OtherIdx) {
// Swap nodes.
SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
N0.getOperand(0), N1, N2);
AddToWorklist(NewOp.getNode());
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
VT, NewOp, N0.getOperand(1), N0.getOperand(2));
}
}
// If the input vector is a concatenation, and the insert replaces
// one of the pieces, we can optimize into a single concat_vectors.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
N0.getOperand(0).getValueType() == N1.getValueType()) {
unsigned Factor = N1.getValueType().getVectorNumElements();
SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
Ops[InsIdx / Factor] = N1;
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
// Simplify source operands based on insertion.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
}
SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
SDValue N0 = N->getOperand(0);
// fold (fp_to_fp16 (fp16_to_fp op)) -> op
if (N0->getOpcode() == ISD::FP16_TO_FP)
return N0->getOperand(0);
return SDValue();
}
SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
// fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
if (N0->getOpcode() == ISD::AND) {
ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
if (AndConst && AndConst->getAPIntValue() == 0xffff) {
return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
N0.getOperand(0));
}
}
return SDValue();
}
SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N0.getValueType();
unsigned Opcode = N->getOpcode();
// VECREDUCE over 1-element vector is just an extract.
if (VT.getVectorNumElements() == 1) {
SDLoc dl(N);
SDValue Res =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
DAG.getVectorIdxConstant(0, dl));
if (Res.getValueType() != N->getValueType(0))
Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
return Res;
}
// On a boolean vector an and/or reduction is the same as a umin/umax
// reduction. Convert them if the latter is legal while the former isn't.
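// Reasoning sketch: the sign-bit check below guarantees every element is
// either 0 or all-ones. Interpreted as unsigned values, all-ones is the
// maximum, so the AND reduction is all-ones exactly when the unsigned minimum
// is, and likewise OR matches the unsigned maximum.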
if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
}
return SDValue();
}
/// Returns a vector_shuffle if it is able to transform an AND to a vector_shuffle
/// with the destination vector and a zero vector.
/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
/// vector_shuffle V, Zero, <0, 4, 2, 4>
SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = peekThroughBitcasts(N->getOperand(1));
SDLoc DL(N);
// Make sure we're not running after operation legalization where it
// may have custom lowered the vector shuffles.
if (LegalOperations)
return SDValue();
if (RHS.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
EVT RVT = RHS.getValueType();
unsigned NumElts = RHS.getNumOperands();
// Attempt to create a valid clear mask, splitting the mask into
// sub elements and checking to see if each is
// all zeros or all ones - suitable for shuffle masking.
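// Illustrative example (assuming a little-endian target): for
// AND v2i32 X, <0x0000ffff, 0xffffffff> with Split == 2 (16-bit sub elements)
// the clear mask built below is <0,5,2,3>, i.e. only the high half of
// element 0 is taken from the zero vector.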
auto BuildClearMask = [&](int Split) {
int NumSubElts = NumElts * Split;
int NumSubBits = RVT.getScalarSizeInBits() / Split;
SmallVector<int, 8> Indices;
for (int i = 0; i != NumSubElts; ++i) {
int EltIdx = i / Split;
int SubIdx = i % Split;
SDValue Elt = RHS.getOperand(EltIdx);
// X & undef --> 0 (not undef). So this lane must be converted to choose
// from the zero constant vector (same as if the element had all 0-bits).
if (Elt.isUndef()) {
Indices.push_back(i + NumSubElts);
continue;
}
APInt Bits;
if (isa<ConstantSDNode>(Elt))
Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
else if (isa<ConstantFPSDNode>(Elt))
Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
else
return SDValue();
// Extract the sub element from the constant bit mask.
if (DAG.getDataLayout().isBigEndian())
Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
else
Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);
if (Bits.isAllOnesValue())
Indices.push_back(i);
else if (Bits == 0)
Indices.push_back(i + NumSubElts);
else
return SDValue();
}
// Let's see if the target supports this vector_shuffle.
EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
return SDValue();
SDValue Zero = DAG.getConstant(0, DL, ClearVT);
return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
DAG.getBitcast(ClearVT, LHS),
Zero, Indices));
};
// Determine maximum split level (byte level masking).
int MaxSplit = 1;
if (RVT.getScalarSizeInBits() % 8 == 0)
MaxSplit = RVT.getScalarSizeInBits() / 8;
for (int Split = 1; Split <= MaxSplit; ++Split)
if (RVT.getScalarSizeInBits() % Split == 0)
if (SDValue S = BuildClearMask(Split))
return S;
return SDValue();
}
/// If a vector binop is performed on splat values, it may be profitable to
/// extract, scalarize, and insert/splat.
static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned Opcode = N->getOpcode();
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// TODO: Remove/replace the extract cost check? If the elements are available
// as scalars, then there may be no extract cost. Should we ask if
// inserting a scalar back into a vector is cheap instead?
int Index0, Index1;
SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
if (!Src0 || !Src1 || Index0 != Index1 ||
Src0.getValueType().getVectorElementType() != EltVT ||
Src1.getValueType().getVectorElementType() != EltVT ||
!TLI.isExtractVecEltCheap(VT, Index0) ||
!TLI.isOperationLegalOrCustom(Opcode, EltVT))
return SDValue();
SDLoc DL(N);
SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
// If all lanes but 1 are undefined, no need to splat the scalar result.
// TODO: Keep track of undefs and use that info in the general case.
if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
// bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
// build_vec ..undef, (bo X, Y), undef...
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
Ops[Index0] = ScalarBO;
return DAG.getBuildVector(VT, DL, Ops);
}
// bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
return DAG.getBuildVector(VT, DL, Ops);
}
/// Visit a binary vector operation, like ADD.
SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
assert(N->getValueType(0).isVector() &&
"SimplifyVBinOp only works on vectors!");
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue Ops[] = {LHS, RHS};
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
SDNodeFlags Flags = N->getFlags();
// See if we can constant fold the vector operation.
if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
return Fold;
// Move unary shuffles with identical masks after a vector binop:
// VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
// --> shuffle (VBinOp A, B), Undef, Mask
// This does not require type legality checks because we are creating the
// same types of operations that are in the original sequence. We do have to
// restrict ops like integer div that have immediate UB (e.g., div-by-zero)
// though. This code is adapted from the identical transform in instcombine.
if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
Opcode != ISD::UREM && Opcode != ISD::SREM &&
Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
(LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
SDLoc DL(N);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
RHS.getOperand(0), Flags);
SDValue UndefV = LHS.getOperand(1);
return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
}
// Try to sink a splat shuffle after a binop with a uniform constant.
// This is limited to cases where neither the shuffle nor the constant have
// undefined elements because that could be poison-unsafe or inhibit
// demanded elements analysis. It is further limited to not change a splat
// of an inserted scalar because that may be optimized better by
// load-folding or other target-specific behaviors.
if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
// binop (splat X), (splat C) --> splat (binop X, C)
SDLoc DL(N);
SDValue X = Shuf0->getOperand(0);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
Shuf0->getMask());
}
if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
// binop (splat C), (splat X) --> splat (binop C, X)
SDLoc DL(N);
SDValue X = Shuf1->getOperand(0);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
Shuf1->getMask());
}
}
// The following pattern is likely to emerge with vector reduction ops. Moving
// the binary operation ahead of insertion may allow using a narrower vector
// instruction that has better performance than the wide version of the op:
// VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
LHS.getOperand(2) == RHS.getOperand(2) &&
(LHS.hasOneUse() || RHS.hasOneUse())) {
SDValue X = LHS.getOperand(1);
SDValue Y = RHS.getOperand(1);
SDValue Z = LHS.getOperand(2);
EVT NarrowVT = X.getValueType();
if (NarrowVT == Y.getValueType() &&
TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
// (binop undef, undef) may not return undef, so compute that result.
SDLoc DL(N);
SDValue VecC =
DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
}
}
// Make sure all but the first op are undef or constant.
auto ConcatWithConstantOrUndef = [](SDValue Concat) {
return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
[](const SDValue &Op) {
return Op.isUndef() ||
ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
});
};
// The following pattern is likely to emerge with vector reduction ops. Moving
// the binary operation ahead of the concat may allow using a narrower vector
// instruction that has better performance than the wide version of the op:
// VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
// concat (VBinOp X, Y), VecC
if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
(LHS.hasOneUse() || RHS.hasOneUse())) {
EVT NarrowVT = LHS.getOperand(0).getValueType();
if (NarrowVT == RHS.getOperand(0).getValueType() &&
TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
SDLoc DL(N);
unsigned NumOperands = LHS.getNumOperands();
SmallVector<SDValue, 4> ConcatOps;
for (unsigned i = 0; i != NumOperands; ++i) {
// For operands 1 and up this constant folds (they are undef or constant).
ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
RHS.getOperand(i)));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
}
}
if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
return V;
return SDValue();
}
SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2) {
assert(N0.getOpcode() == ISD::SETCC && "First argument must be a SetCC node!");
SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
cast<CondCodeSDNode>(N0.getOperand(2))->get());
// If we got a simplified select_cc node back from SimplifySelectCC, then
// break it down into a new SETCC node, and a new SELECT node, and then return
// the SELECT node, since we were called with a SELECT node.
if (SCC.getNode()) {
// Check to see if we got a select_cc back (to turn into setcc/select).
// Otherwise, just return whatever node we got back, like fabs.
if (SCC.getOpcode() == ISD::SELECT_CC) {
const SDNodeFlags Flags = N0.getNode()->getFlags();
SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
N0.getValueType(),
SCC.getOperand(0), SCC.getOperand(1),
SCC.getOperand(4), Flags);
AddToWorklist(SETCC.getNode());
SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
SCC.getOperand(2), SCC.getOperand(3));
SelectNode->setFlags(Flags);
return SelectNode;
}
return SCC;
}
return SDValue();
}
/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
/// being selected between, see if we can simplify the select. Callers of this
/// should assume that TheSelect is deleted if this returns true. As such, they
/// should return the appropriate thing (e.g. the node) back to the top-level of
/// the DAG combiner loop to avoid it being looked at.
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
SDValue RHS) {
// fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
// The select + setcc is redundant, because fsqrt returns NaN for X < 0.
if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
// We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
SDValue Sqrt = RHS;
ISD::CondCode CC;
SDValue CmpLHS;
const ConstantFPSDNode *Zero = nullptr;
if (TheSelect->getOpcode() == ISD::SELECT_CC) {
CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
CmpLHS = TheSelect->getOperand(0);
Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
} else {
// SELECT or VSELECT
SDValue Cmp = TheSelect->getOperand(0);
if (Cmp.getOpcode() == ISD::SETCC) {
CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
CmpLHS = Cmp.getOperand(0);
Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
}
}
if (Zero && Zero->isZero() &&
Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
CC == ISD::SETULT || CC == ISD::SETLT)) {
// We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
CombineTo(TheSelect, Sqrt);
return true;
}
}
}
// Cannot simplify select with vector condition
if (TheSelect->getOperand(0).getValueType().isVector()) return false;
// If this is a select from two identical things, try to pull the operation
// through the select.
if (LHS.getOpcode() != RHS.getOpcode() ||
!LHS.hasOneUse() || !RHS.hasOneUse())
return false;
// If this is a load and the token chain is identical, replace the select
// of two loads with a load through a select of the address to load from.
// This triggers in things like "select bool X, 10.0, 123.0" after the FP
// constants have been dropped into the constant pool.
if (LHS.getOpcode() == ISD::LOAD) {
LoadSDNode *LLD = cast<LoadSDNode>(LHS);
LoadSDNode *RLD = cast<LoadSDNode>(RHS);
// Token chains must be identical.
if (LHS.getOperand(0) != RHS.getOperand(0) ||
// Do not let this transformation reduce the number of volatile loads.
// Be conservative for atomics for the moment
// TODO: This does appear to be legal for unordered atomics (see D66309)
!LLD->isSimple() || !RLD->isSimple() ||
// FIXME: If either is a pre/post inc/dec load,
// we'd need to split out the address adjustment.
LLD->isIndexed() || RLD->isIndexed() ||
// If this is an EXTLOAD, the VT's must match.
LLD->getMemoryVT() != RLD->getMemoryVT() ||
// If this is an EXTLOAD, the kind of extension must match.
(LLD->getExtensionType() != RLD->getExtensionType() &&
// The only exception is if one of the extensions is anyext.
LLD->getExtensionType() != ISD::EXTLOAD &&
RLD->getExtensionType() != ISD::EXTLOAD) ||
// FIXME: this discards src value information. This is
// over-conservative. It would be beneficial to be able to remember
// both potential memory locations. Since we are discarding
// src value info, don't do the transformation if the memory
// locations are not in the default address space.
LLD->getPointerInfo().getAddrSpace() != 0 ||
RLD->getPointerInfo().getAddrSpace() != 0 ||
// We can't produce a CMOV of a TargetFrameIndex since we won't
// generate the address generation required.
LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
!TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
LLD->getBasePtr().getValueType()))
return false;
// The loads must not depend on one another.
if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
return false;
// Check that the select condition doesn't reach either load. If so,
// folding this will induce a cycle into the DAG. If not, this is safe to
// xform, so create a select of the addresses.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
// Always fail if LLD and RLD are not independent. TheSelect is a
// predecessor to all Nodes in question so we need not search past it.
Visited.insert(TheSelect);
Worklist.push_back(LLD);
Worklist.push_back(RLD);
if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
return false;
SDValue Addr;
if (TheSelect->getOpcode() == ISD::SELECT) {
// We cannot do this optimization if any pair of {RLD, LLD} is a
// predecessor to {RLD, LLD, CondNode}. As we've already compared the
// Loads, we only need to check if CondNode is a successor to one of the
// loads. We can further avoid this if there's no use of their chain
// value.
SDNode *CondNode = TheSelect->getOperand(0).getNode();
Worklist.push_back(CondNode);
if ((LLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
(RLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
return false;
Addr = DAG.getSelect(SDLoc(TheSelect),
LLD->getBasePtr().getValueType(),
TheSelect->getOperand(0), LLD->getBasePtr(),
RLD->getBasePtr());
} else { // Otherwise SELECT_CC
// We cannot do this optimization if any pair of {RLD, LLD} is a
// predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
// the Loads, we only need to check if CondLHS/CondRHS is a successor to
// one of the loads. We can further avoid this if there's no use of their
// chain value.
SDNode *CondLHS = TheSelect->getOperand(0).getNode();
SDNode *CondRHS = TheSelect->getOperand(1).getNode();
Worklist.push_back(CondLHS);
Worklist.push_back(CondRHS);
if ((LLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
(RLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
return false;
Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
LLD->getBasePtr().getValueType(),
TheSelect->getOperand(0),
TheSelect->getOperand(1),
LLD->getBasePtr(), RLD->getBasePtr(),
TheSelect->getOperand(4));
}
SDValue Load;
// It is safe to replace the two loads if they have different alignments,
// but the new load must use the minimum (most restrictive) alignment of
// the inputs.
unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
if (!RLD->isInvariant())
MMOFlags &= ~MachineMemOperand::MOInvariant;
if (!RLD->isDereferenceable())
MMOFlags &= ~MachineMemOperand::MODereferenceable;
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
// FIXME: Discards pointer and AA info.
Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
MMOFlags);
} else {
// FIXME: Discards pointer and AA info.
Load = DAG.getExtLoad(
LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
: LLD->getExtensionType(),
SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
}
// Users of the select now use the result of the load.
CombineTo(TheSelect, Load);
// Users of the old loads now use the new load's chain. We know the
// old-load value is dead now.
CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
return true;
}
return false;
}
/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
/// bitwise 'and'.
SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC) {
// If this is a select where the false operand is zero and the compare is a
// check of the sign bit, see if we can perform the "gzip trick":
// select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
// select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
EVT XType = N0.getValueType();
EVT AType = N2.getValueType();
if (!isNullConstant(N3) || !XType.bitsGE(AType))
return SDValue();
// If the comparison is testing for a positive value, we have to invert
// the sign bit mask, so only do that transform if the target has a bitwise
// 'and not' instruction (the invert is free).
if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
// (X > -1) ? A : 0
// (X > 0) ? X : 0 <-- This is canonical signed max.
if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
return SDValue();
} else if (CC == ISD::SETLT) {
// (X < 0) ? A : 0
// (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
return SDValue();
} else {
return SDValue();
}
// and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
// constant.
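// Illustrative example (i32, A == 4): ShCt == 32 - 2 - 1 == 29, so
// (srl X, 29) moves the sign bit into bit 2, and the AND with A yields
// 4 when X is negative and 0 otherwise.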
EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
AddToWorklist(Shift.getNode());
if (XType.bitsGT(AType)) {
Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
AddToWorklist(Shift.getNode());
}
if (CC == ISD::SETGT)
Shift = DAG.getNOT(DL, Shift, AType);
return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
}
}
unsigned ShCt = XType.getSizeInBits() - 1;
if (TLI.shouldAvoidTransformToShift(XType, ShCt))
return SDValue();
SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
AddToWorklist(Shift.getNode());
if (XType.bitsGT(AType)) {
Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
AddToWorklist(Shift.getNode());
}
if (CC == ISD::SETGT)
Shift = DAG.getNOT(DL, Shift, AType);
return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
}
/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
/// in it. This may be a win when the constant is not otherwise available
/// because it replaces two constant pool loads with one.
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC) {
if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
return SDValue();
// If we are before legalize types, we want the other legalization to happen
// first (for example, to avoid messing with soft float).
auto *TV = dyn_cast<ConstantFPSDNode>(N2);
auto *FV = dyn_cast<ConstantFPSDNode>(N3);
EVT VT = N2.getValueType();
if (!TV || !FV || !TLI.isTypeLegal(VT))
return SDValue();
// If a constant can be materialized without loads, this does not make sense.
if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
return SDValue();
// If both constants have multiple uses, then we won't need to do an extra
// load. The values are likely around in registers for other users.
if (!TV->hasOneUse() && !FV->hasOneUse())
return SDValue();
Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
const_cast<ConstantFP*>(TV->getConstantFPValue()) };
Type *FPTy = Elts[0]->getType();
const DataLayout &TD = DAG.getDataLayout();
// Create a ConstantArray of the two constants.
Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
TD.getPrefTypeAlign(FPTy));
Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
// Get offsets to the 0 and 1 elements of the array, so we can select between
// them.
SDValue Zero = DAG.getIntPtrConstant(0, DL);
unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
SDValue Cond =
DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
AddToWorklist(Cond.getNode());
SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
AddToWorklist(CstOffset.getNode());
CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
AddToWorklist(CPIdx.getNode());
return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(
DAG.getMachineFunction()), Alignment);
}
/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC,
bool NotExtCompare) {
// (x ? y : y) -> y.
if (N2 == N3) return N2;
EVT CmpOpVT = N0.getValueType();
EVT CmpResVT = getSetCCResultType(CmpOpVT);
EVT VT = N2.getValueType();
auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
// Determine if the condition we're dealing with is constant.
if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
AddToWorklist(SCC.getNode());
if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
// fold select_cc true, x, y -> x
// fold select_cc false, x, y -> y
return !(SCCC->isNullValue()) ? N2 : N3;
}
}
if (SDValue V =
convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
return V;
if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
return V;
// fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
// where y has a single bit set.
// In plain terms: we can turn the SELECT_CC into an AND when the condition
// can be materialized as an all-ones register. Any single bit-test can be
// materialized as an all-ones register with a shift-left followed by a
// shift-right-arithmetic.
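// Illustrative example (i32, y == 8, i.e. bit 3): shl by 28 moves the tested
// bit into the sign position, sra by 31 smears it into all-ones or zero, and
// the final AND selects A or 0 accordingly.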
if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
SDValue AndLHS = N0->getOperand(0);
auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
// Shift the tested bit over the sign bit.
const APInt &AndMask = ConstAndRHS->getAPIntValue();
unsigned ShCt = AndMask.getBitWidth() - 1;
if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
SDValue ShlAmt =
DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
getShiftAmountTy(AndLHS.getValueType()));
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
// Now arithmetic right shift it all the way over, so the result is
// either all-ones, or zero.
SDValue ShrAmt =
DAG.getConstant(ShCt, SDLoc(Shl),
getShiftAmountTy(Shl.getValueType()));
SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
}
}
}
// fold select C, 16, 0 -> shl C, 4
bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();
if ((Fold || Swap) &&
TLI.getBooleanContents(CmpOpVT) ==
TargetLowering::ZeroOrOneBooleanContent &&
(!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
if (Swap) {
CC = ISD::getSetCCInverse(CC, CmpOpVT);
std::swap(N2C, N3C);
}
// If the caller doesn't want us to simplify this into a zext of a compare,
// don't do it.
if (NotExtCompare && N2C->isOne())
return SDValue();
SDValue Temp, SCC;
// zext (setcc n0, n1)
if (LegalTypes) {
SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
if (VT.bitsLT(SCC.getValueType()))
Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
else
Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
} else {
SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
}
AddToWorklist(SCC.getNode());
AddToWorklist(Temp.getNode());
if (N2C->isOne())
return Temp;
unsigned ShCt = N2C->getAPIntValue().logBase2();
if (TLI.shouldAvoidTransformToShift(VT, ShCt))
return SDValue();
// shl setcc result by log2 n2c
return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
DAG.getConstant(ShCt, SDLoc(Temp),
getShiftAmountTy(Temp.getValueType())));
}
// select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
// select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
// select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
// select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
// select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
// select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
// select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
// select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
SDValue ValueOnZero = N2;
SDValue Count = N3;
// If the condition is NE instead of EQ, swap the operands.
if (CC == ISD::SETNE)
std::swap(ValueOnZero, Count);
// Check if the value on zero is a constant equal to the bits in the type.
if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
// If the other operand is cttz/cttz_zero_undef of N0, and cttz is
// legal, combine to just cttz.
if ((Count.getOpcode() == ISD::CTTZ ||
Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
N0 == Count.getOperand(0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
return DAG.getNode(ISD::CTTZ, DL, VT, N0);
// If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
// legal, combine to just ctlz.
if ((Count.getOpcode() == ISD::CTLZ ||
Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
N0 == Count.getOperand(0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
return DAG.getNode(ISD::CTLZ, DL, VT, N0);
}
}
}
return SDValue();
}
/// This is a stub for TargetLowering::SimplifySetCC.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, const SDLoc &DL,
bool foldBooleans) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
}
/// Given an ISD::SDIV node expressing a divide by constant, return
/// a DAG expression to select that will generate the same value by multiplying
/// by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
SDValue DAGCombiner::BuildSDIV(SDNode *N) {
// When optimising for minimum size, we don't want to expand a div into a mul
// and a shift.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
SmallVector<SDNode *, 8> Built;
if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
for (SDNode *N : Built)
AddToWorklist(N);
return S;
}
return SDValue();
}
/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
/// DAG expression that will generate the same value by right shifting.
SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
if (!C)
return SDValue();
// Avoid division by zero.
if (C->isNullValue())
return SDValue();
SmallVector<SDNode *, 8> Built;
if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) {
for (SDNode *N : Built)
AddToWorklist(N);
return S;
}
return SDValue();
}
/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
/// expression that will generate the same value by multiplying by a magic
/// number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
// When optimising for minimum size, we don't want to expand a div into a mul
// and a shift.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
SmallVector<SDNode *, 8> Built;
if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
for (SDNode *N : Built)
AddToWorklist(N);
return S;
}
return SDValue();
}
/// Determines the LogBase2 value for a non-null input value using the
/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
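/// For example, with i32 elements and V == 16: ctlz(16) == 27, so
/// LogBase2(16) == 31 - 27 == 4.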
SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
EVT VT = V.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V);
SDValue Base = DAG.getConstant(EltBits - 1, DL, VT);
SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz);
return LogBase2;
}
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
/// F(X) = A X - 1 [which has a zero at X = 1/A]
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
/// For the last iteration, put numerator N into it to gain more precision:
/// Result = N X_i + X_i (N - N A X_i)
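/// Numeric sketch (illustrative only): for A = 4 and an initial estimate
/// X_0 = 0.2, X_1 = 0.2 * (2 - 0.8) = 0.24 and X_2 = 0.24 * (2 - 0.96) = 0.2496,
/// converging towards 1/A = 0.25.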
SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
SDNodeFlags Flags) {
if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
EVT VT = Op.getValueType();
if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
MachineFunction &MF = DAG.getMachineFunction();
int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
if (Enabled == TLI.ReciprocalEstimate::Disabled)
return SDValue();
// Estimates may be explicitly enabled for this type with a custom number of
// refinement steps.
int Iterations = TLI.getDivRefinementSteps(VT, MF);
if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
AddToWorklist(Est.getNode());
SDLoc DL(Op);
if (Iterations) {
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
// Newton iterations: Est = Est + Est (N - Arg * Est)
// If this is the last iteration, also multiply by the numerator.
for (int i = 0; i < Iterations; ++i) {
SDValue MulEst = Est;
if (i == Iterations - 1) {
MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
AddToWorklist(MulEst.getNode());
}
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
AddToWorklist(NewEst.getNode());
NewEst = DAG.getNode(ISD::FSUB, DL, VT,
(i == Iterations - 1 ? N : FPOne), NewEst, Flags);
AddToWorklist(NewEst.getNode());
NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
AddToWorklist(NewEst.getNode());
Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
AddToWorklist(Est.getNode());
}
} else {
// If no iterations are available, multiply with N.
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
AddToWorklist(Est.getNode());
}
return Est;
}
return SDValue();
}
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
/// =>
/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
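/// Numeric sketch (illustrative only): for A = 4 (so A/2 = 2) and an initial
/// estimate X_0 = 0.6, X_1 = 0.6 * (1.5 - 2 * 0.36) = 0.468 and
/// X_2 = 0.468 * (1.5 - 2 * 0.468^2) ~= 0.497, converging towards
/// 1/sqrt(A) = 0.5.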
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
// We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
// this entire sequence requires only one FP constant.
SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
// Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
}
// If non-reciprocal square root is requested, multiply the result by Arg.
if (!Reciprocal)
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
return Est;
}
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
/// =>
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
// This routine must enter the loop below to work correctly
// when (Reciprocal == false).
assert(Iterations > 0);
// Newton iterations for reciprocal square root:
// E = (E * -0.5) * ((A * E) * E + -3.0)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
// When calculating a square root at the last iteration build:
// S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
// (notice a common subexpression)
SDValue LHS;
if (Reciprocal || (i + 1) < Iterations) {
// RSQRT: LHS = (E * -0.5)
LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
} else {
// SQRT: LHS = (A * E) * -0.5
LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
}
Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
}
return Est;
}
/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
bool Reciprocal) {
if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
EVT VT = Op.getValueType();
if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
MachineFunction &MF = DAG.getMachineFunction();
int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
if (Enabled == TLI.ReciprocalEstimate::Disabled)
return SDValue();
// Estimates may be explicitly enabled for this type with a custom number of
// refinement steps.
int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
bool UseOneConstNR = false;
if (SDValue Est =
TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
Reciprocal)) {
AddToWorklist(Est.getNode());
if (Iterations) {
Est = UseOneConstNR
? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
if (!Reciprocal) {
// The estimate is now completely wrong if the input was exactly 0.0 or
// possibly a denormal. Force the answer to 0.0 for those cases.
SDLoc DL(Op);
EVT CCVT = getSetCCResultType(VT);
ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
DenormalMode DenormMode = DAG.getDenormalMode(VT);
if (DenormMode.Input == DenormalMode::IEEE) {
// This is specifically a check for the handling of denormal inputs,
// not the result.
// fabs(X) < SmallestNormal ? 0.0 : Est
const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
} else {
// X == 0.0 ? 0.0 : Est
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
}
}
}
return Est;
}
return SDValue();
}
SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, true);
}
SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, false);
}
/// Return true if there is any possibility that the two addresses overlap.
bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
struct MemUseCharacteristics {
bool IsVolatile;
bool IsAtomic;
SDValue BasePtr;
int64_t Offset;
Optional<int64_t> NumBytes;
MachineMemOperand *MMO;
};
auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
? C->getSExtValue()
: (LSN->getAddressingMode() == ISD::PRE_DEC)
? -1 * C->getSExtValue()
: 0;
uint64_t Size =
MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
Offset /*base offset*/,
Optional<int64_t>(Size),
LSN->getMemOperand()};
}
// Use dyn_cast so non-memory nodes fall through to the conservative default.
if (const auto *LN = dyn_cast<LifetimeSDNode>(N))
return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
(LN->hasOffset()) ? LN->getOffset() : 0,
(LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
: Optional<int64_t>(),
(MachineMemOperand *)nullptr};
// Default.
return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
(int64_t)0 /*offset*/,
Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
};
MemUseCharacteristics MUC0 = getCharacteristics(Op0),
MUC1 = getCharacteristics(Op1);
// If they are to the same address, then they must be aliases.
if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
MUC0.Offset == MUC1.Offset)
return true;
// If they are both volatile then they cannot be reordered.
if (MUC0.IsVolatile && MUC1.IsVolatile)
return true;
// Be conservative about atomics for the moment
// TODO: This is way overconservative for unordered atomics (see D66309)
if (MUC0.IsAtomic && MUC1.IsAtomic)
return true;
if (MUC0.MMO && MUC1.MMO) {
if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
(MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
return false;
}
// Try to prove that there is aliasing, or that there is no aliasing. Either
// way, we can return now. If nothing can be proved, proceed with more tests.
bool IsAlias;
if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
DAG, IsAlias))
return IsAlias;
// The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
// either are not known.
if (!MUC0.MMO || !MUC1.MMO)
return true;
// If one operation reads from invariant memory, and the other may store, they
// cannot alias. These should really be checking the equivalent of mayWrite,
// but it only matters for memory nodes other than load/store.
if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
(MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
return false;
// If the underlying values SrcValue1 and SrcValue2 are known to have
// relatively large alignment compared to the size and offset of the access,
// we may be able to prove they do not alias. This check is conservative
// for now to catch cases created by splitting vector types; it only works
// when the offsets are multiples of the size of the data.
int64_t SrcValOffset0 = MUC0.MMO->getOffset();
int64_t SrcValOffset1 = MUC1.MMO->getOffset();
Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
auto &Size0 = MUC0.NumBytes;
auto &Size1 = MUC1.NumBytes;
if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 &&
OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
SrcValOffset1 % *Size1 == 0) {
int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
// There is no overlap between these relatively aligned accesses of
// similar size. Return no alias.
if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
return false;
}
bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
? CombinerGlobalAA
: DAG.getSubtarget().useAA();
#ifndef NDEBUG
if (CombinerAAOnlyFunc.getNumOccurrences() &&
CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
UseAA = false;
#endif
if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
Size0.hasValue() && Size1.hasValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
AliasResult AAResult = AA->alias(
MemoryLocation(MUC0.MMO->getValue(), Overlap0,
UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
MemoryLocation(MUC1.MMO->getValue(), Overlap1,
UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes()));
if (AAResult == NoAlias)
return false;
}
// Otherwise we have to assume they alias.
return true;
}
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallVectorImpl<SDValue> &Aliases) {
SmallVector<SDValue, 8> Chains; // List of chains to visit.
SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
// Get alias information for node.
// TODO: relax aliasing for unordered atomics (see D66309)
const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
// Starting off.
Chains.push_back(OriginalChain);
unsigned Depth = 0;
// Attempt to improve chain by a single step
std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool {
switch (C.getOpcode()) {
case ISD::EntryToken:
// No need to mark EntryToken.
C = SDValue();
return true;
case ISD::LOAD:
case ISD::STORE: {
// Get alias information for C.
// TODO: Relax aliasing for unordered atomics (see D66309)
bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
cast<LSBaseSDNode>(C.getNode())->isSimple();
if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
}
// Alias, so stop here.
return false;
}
case ISD::CopyFromReg:
// Always forward past CopyFromReg.
C = C.getOperand(0);
return true;
case ISD::LIFETIME_START:
case ISD::LIFETIME_END: {
// We can forward past any lifetime start/end that can be proven not to
// alias the memory access.
if (!isAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
}
return false;
}
default:
return false;
}
};
// Look at each chain and determine if it is an alias. If so, add it to the
// aliases list. If not, then continue up the chain looking for the next
// candidate.
while (!Chains.empty()) {
SDValue Chain = Chains.pop_back_val();
// Don't bother if we've seen Chain before.
if (!Visited.insert(Chain.getNode()).second)
continue;
// For TokenFactor nodes, look at each operand and only continue up the
// chain until we reach the depth limit.
//
// FIXME: The depth check could be made to return the last non-aliasing
// chain we found before we hit a tokenfactor rather than the original
// chain.
if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
Aliases.clear();
Aliases.push_back(OriginalChain);
return;
}
if (Chain.getOpcode() == ISD::TokenFactor) {
// We have to check each of the operands of the token factor for "small"
// token factors, so we queue them up. Adding the operands to the queue
// (stack) in reverse order maintains the original order and increases the
// likelihood that getNode will find a matching token factor (CSE).
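// For instance (a hypothetical token factor): operands (A, B, C) are pushed as
// C, B, A and therefore pop off the Chains stack as A, B, C, preserving the
// original operand order.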
if (Chain.getNumOperands() > 16) {
Aliases.push_back(Chain);
continue;
}
for (unsigned n = Chain.getNumOperands(); n;)
Chains.push_back(Chain.getOperand(--n));
++Depth;
continue;
}
// Everything else
if (ImproveChain(Chain)) {
// Updated chain found; consider the new chain if one exists.
if (Chain.getNode())
Chains.push_back(Chain);
++Depth;
continue;
}
// No improved chain possible; treat as an alias.
Aliases.push_back(Chain);
}
}
/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
/// (aliasing node.)
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
if (OptLevel == CodeGenOpt::None)
return OldChain;
// Ops for replacing token factor.
SmallVector<SDValue, 8> Aliases;
// Accumulate all the aliases to this node.
GatherAllAliases(N, OldChain, Aliases);
// If no operands then chain to entry token.
if (Aliases.size() == 0)
return DAG.getEntryNode();
// If a single operand then chain to it. We don't need to revisit it.
if (Aliases.size() == 1)
return Aliases[0];
// Construct a custom tailored token factor.
return DAG.getTokenFactor(SDLoc(N), Aliases);
}
namespace {
// TODO: Replace with std::monostate when we move to C++17.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }
} // namespace
// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.
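// As an illustrative sketch (offsets and sizes are hypothetical): given
// chained stores of 4, 4 and 8 bytes covering 16 contiguous bytes, improving
// all three chains up front lets mergeConsecutiveStores see the whole run at
// once, instead of first merging only the two 4-byte stores and then being
// stuck with a partially-merged result.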
bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
SmallVector<StoreSDNode *, 8> ChainedStores;
StoreSDNode *STChain = St;
// Intervals records which offsets from BaseIndex have been covered. In
// the common case, each store found while walking the chain writes to the
// address range immediately before the one already covered and is thus
// merged with the existing interval at insertion time.
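// For example (hypothetical offsets): starting from St's interval [0, 4), a
// chained store covering [-4, 0) passes the half-open overlap checks below and
// coalesces with it, while one covering [-2, 2) overlaps [0, 4) and stops the
// walk.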
using IMap =
llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
IMap::Allocator A;
IMap Intervals(A);
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return false;
// Do not handle stores to undef base pointers.
if (BasePtr.getBase().isUndef())
return false;
// BaseIndexOffset assumes that offsets are fixed-size, which
// is not valid for scalable vectors where the offsets are
// scaled by `vscale`, so bail out early.
if (St->getMemoryVT().isScalableVector())
return false;
// Add ST's interval.
Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
// If the chain has more than one use, then we can't reorder the mem ops.
if (!SDValue(Chain, 0)->hasOneUse())
break;
// TODO: Relax for unordered atomics (see D66309)
if (!Chain->isSimple() || Chain->isIndexed())
break;
// Find the base pointer and offset for this memory node.
const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
// Check that the base pointer is the same as the original one.
int64_t Offset;
if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
break;
int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
// Make sure we don't overlap with other intervals by checking the ones to
// the left or right before inserting.
auto I = Intervals.find(Offset);
// If there's a next interval, we should end before it.
if (I != Intervals.end() && I.start() < (Offset + Length))
break;
// If there's a previous interval, we should start after it.
if (I != Intervals.begin() && (--I).stop() <= Offset)
break;
Intervals.insert(Offset, Offset + Length, Unit);
ChainedStores.push_back(Chain);
STChain = Chain;
}
// If we didn't find a chained store, exit.
if (ChainedStores.size() == 0)
return false;
// Improve all chained stores (St and ChainedStores members) starting from
// where the store chain ended and return single TokenFactor.
SDValue NewChain = STChain->getChain();
SmallVector<SDValue, 8> TFOps;
for (unsigned I = ChainedStores.size(); I;) {
StoreSDNode *S = ChainedStores[--I];
SDValue BetterChain = FindBetterChain(S, NewChain);
S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
TFOps.push_back(SDValue(S, 0));
ChainedStores[I] = S;
}
// Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
SDValue BetterChain = FindBetterChain(St, NewChain);
SDValue NewST;
if (St->isTruncatingStore())
NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
St->getBasePtr(), St->getMemoryVT(),
St->getMemOperand());
else
NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
St->getBasePtr(), St->getMemOperand());
TFOps.push_back(NewST);
// If we improved every element of TFOps, then we've lost the dependence on
// NewChain to successors of St and we need to add it back to TFOps. Do so at
// the beginning to keep relative order consistent with FindBetterChains.
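// A concrete sketch (hypothetical): if FindBetterChain rewired every store in
// TFOps straight to the entry token, the token factor built below would no
// longer depend on NewChain at all, so NewChain is prepended to keep that
// ordering visible to St's chain users.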
auto hasImprovedChain = [&](SDValue ST) -> bool {
return ST->getOperand(0) != NewChain;
};
bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
if (AddNewChain)
TFOps.insert(TFOps.begin(), NewChain);
SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
CombineTo(St, TF);
// Add TF and its operands to the worklist.
AddToWorklist(TF.getNode());
for (const SDValue &Op : TF->ops())
AddToWorklist(Op.getNode());
AddToWorklist(STChain);
return true;
}
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None)
return false;
const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return false;
// Do not handle stores to undef base pointers.
if (BasePtr.getBase().isUndef())
return false;
// Directly improve a chain of disjoint stores starting at St.
if (parallelizeChainedStores(St))
return true;
// Improve St's chain.
SDValue BetterChain = FindBetterChain(St, St->getChain());
if (St->getChain() != BetterChain) {
replaceStoreChain(St, BetterChain);
return true;
}
return false;
}
/// This is the entry point for the file.
void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 414ba25ffd5f..c81d03cac81b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1,5247 +1,5258 @@
//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file performs vector type splitting and scalarization for LegalizeTypes.
// Scalarization is the act of changing a computation in an illegal one-element
// vector type to be a computation in its scalar element type. For example,
// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
// as a base case when scalarizing vector arithmetic like <4 x f32>, which
// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
// types.
// Splitting is the act of changing a computation in an invalid vector type to
// be a computation in two vectors of half the size. For example, implementing
// <128 x f32> operations in terms of two <64 x f32> operations.
//
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "legalize-types"
//===----------------------------------------------------------------------===//
// Result Vector Scalarization: <1 x ty> -> ty.
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue R = SDValue();
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to scalarize the result of this "
"operator!\n");
case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break;
case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
R = ScalarizeVecRes_VecInregOp(N);
break;
case ISD::ABS:
case ISD::ANY_EXTEND:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
case ISD::FCANONICALIZE:
R = ScalarizeVecRes_UnaryOp(N);
break;
case ISD::ADD:
case ISD::AND:
case ISD::FADD:
case ISD::FCOPYSIGN:
case ISD::FDIV:
case ISD::FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT:
case ISD::FPOW:
case ISD::FREM:
case ISD::FSUB:
case ISD::MUL:
case ISD::OR:
case ISD::SDIV:
case ISD::SREM:
case ISD::SUB:
case ISD::UDIV:
case ISD::UREM:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
R = ScalarizeVecRes_BinOp(N);
break;
case ISD::FMA:
R = ScalarizeVecRes_TernaryOp(N);
break;
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
R = ScalarizeVecRes_StrictFPOp(N);
break;
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
R = ScalarizeVecRes_OverflowOp(N, ResNo);
break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
case ISD::UDIVFIXSAT:
R = ScalarizeVecRes_FIX(N);
break;
}
// If R is null, the sub-method took care of registering the result.
if (R.getNode())
SetScalarizedVector(SDValue(N, ResNo), R);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), SDLoc(N),
LHS.getValueType(), LHS, RHS, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
Op2, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = N->getOperand(2);
return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
Op2, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
EVT VT = N->getValueType(0).getVectorElementType();
unsigned NumOpers = N->getNumOperands();
SDValue Chain = N->getOperand(0);
EVT ValueVTs[] = {VT, MVT::Other};
SDLoc dl(N);
SmallVector<SDValue, 4> Opers(NumOpers);
// The Chain is the first operand.
Opers[0] = Chain;
// Now process the remaining operands.
for (unsigned i = 1; i < NumOpers; ++i) {
SDValue Oper = N->getOperand(i);
if (Oper.getValueType().isVector())
Oper = GetScalarizedVector(Oper);
Opers[i] = Oper;
}
SDValue Result = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(ValueVTs),
Opers, N->getFlags());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
unsigned ResNo) {
SDLoc DL(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
SDValue ScalarLHS, ScalarRHS;
if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
ScalarLHS = GetScalarizedVector(N->getOperand(0));
ScalarRHS = GetScalarizedVector(N->getOperand(1));
} else {
SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
ScalarLHS = ElemsLHS[0];
ScalarRHS = ElemsRHS[0];
}
SDVTList ScalarVTs = DAG.getVTList(
ResVT.getVectorElementType(), OvVT.getVectorElementType());
SDNode *ScalarNode = DAG.getNode(
N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
ScalarNode->setFlags(N->getFlags());
// Replace the other vector result not being explicitly scalarized here.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
} else {
SDValue OtherVal = DAG.getNode(
ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
}
return SDValue(ScalarNode, ResNo);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
return GetScalarizedVector(Op);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
SDValue Op = N->getOperand(0);
if (Op.getValueType().isVector()
&& Op.getValueType().getVectorNumElements() == 1
&& !isSimpleLegalType(Op.getValueType()))
Op = GetScalarizedVector(Op);
EVT NewVT = N->getValueType(0).getVectorElementType();
return DAG.getNode(ISD::BITCAST, SDLoc(N),
NewVT, Op);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
// The BUILD_VECTOR operands may be of wider element types and
// we may need to truncate them back to the requested return type.
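// For example (a hypothetical case): a <1 x i8> BUILD_VECTOR whose operand was
// promoted to an i32 constant gets truncated back down to i8 here.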
if (EltVT.isInteger())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
N->getValueType(0).getVectorElementType(),
N->getOperand(0), N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
NewVT, Op, N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FPOWI, SDLoc(N),
Op.getValueType(), Op, N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
// The value to insert may have a wider type than the vector element type,
// so be sure to truncate it to the element type if necessary.
SDValue Op = N->getOperand(1);
EVT EltVT = N->getValueType(0).getVectorElementType();
if (Op.getValueType() != EltVT)
// FIXME: Can this happen for floating point types?
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
return Op;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
assert(N->isUnindexed() && "Indexed vector load?");
SDValue Result = DAG.getLoad(
ISD::UNINDEXED, N->getExtensionType(),
N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
N->getOriginalAlign(), N->getMemOperand()->getFlags(), N->getAAInfo());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
// Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
EVT DestVT = N->getValueType(0).getVectorElementType();
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
// This is a workaround for targets where it's impossible to scalarize the
// result of a conversion, because the source type is legal.
// For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
// are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
// legal and was not scalarized.
// See the similar logic in ScalarizeVecRes_SETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getVectorIdxConstant(0, DL));
}
return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
SDValue LHS = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
LHS, DAG.getValueType(ExtVT));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
EVT OpEltVT = OpVT.getVectorElementType();
EVT EltVT = N->getValueType(0).getVectorElementType();
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
DAG.getVectorIdxConstant(0, DL));
}
switch (N->getOpcode()) {
case ISD::ANY_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
}
llvm_unreachable("Illegal extend_vector_inreg opcode");
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
if (InOp.getValueType() != EltVT)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
SDValue Cond = N->getOperand(0);
EVT OpVT = Cond.getValueType();
SDLoc DL(N);
// The vselect result and the true/false value operands need scalarizing, but
// it's not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
// See the similar logic in ScalarizeVecRes_SETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Cond = GetScalarizedVector(Cond);
} else {
EVT VT = OpVT.getVectorElementType();
Cond = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
DAG.getVectorIdxConstant(0, DL));
}
SDValue LHS = GetScalarizedVector(N->getOperand(1));
TargetLowering::BooleanContent ScalarBool =
TLI.getBooleanContents(false, false);
TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
// If integer and float booleans have different contents then we can't
// reliably optimize in all cases. There is a full explanation for this in
// DAGCombiner::visitSELECT() where the same issue affects folding
// (select C, 0, 1) to (xor C, 1).
if (TLI.getBooleanContents(false, false) !=
TLI.getBooleanContents(false, true)) {
// At least try the common case where the boolean is generated by a
// comparison.
if (Cond->getOpcode() == ISD::SETCC) {
EVT OpVT = Cond->getOperand(0).getValueType();
ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
VecBool = TLI.getBooleanContents(OpVT);
} else
ScalarBool = TargetLowering::UndefinedBooleanContent;
}
EVT CondVT = Cond.getValueType();
if (ScalarBool != VecBool) {
switch (ScalarBool) {
case TargetLowering::UndefinedBooleanContent:
break;
case TargetLowering::ZeroOrOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
// Vector read from all ones, scalar expects a single 1 so mask.
Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
Cond, DAG.getConstant(1, SDLoc(N), CondVT));
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrOneBooleanContent);
// Vector reads from a one, scalar from all ones so sign extend.
Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
Cond, DAG.getValueType(MVT::i1));
break;
}
}
// Truncate the condition if needed
auto BoolVT = getSetCCResultType(CondVT);
if (BoolVT.bitsLT(CondVT))
Cond = DAG.getNode(ISD::TRUNCATE, SDLoc(N), BoolVT, Cond);
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), Cond, LHS,
GetScalarizedVector(N->getOperand(2)));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(1));
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), N->getOperand(0), LHS,
GetScalarizedVector(N->getOperand(2)));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
N->getOperand(0), N->getOperand(1),
LHS, GetScalarizedVector(N->getOperand(3)),
N->getOperand(4));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
// Figure out if the scalar is the LHS or RHS and return it.
SDValue Arg = N->getOperand(2).getOperand(0);
if (Arg.isUndef())
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
return GetScalarizedVector(N->getOperand(Op));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT OpVT = LHS.getValueType();
EVT NVT = N->getValueType(0).getVectorElementType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
LHS = GetScalarizedVector(LHS);
RHS = GetScalarizedVector(RHS);
} else {
EVT VT = OpVT.getVectorElementType();
LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
DAG.getVectorIdxConstant(0, DL));
RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
DAG.getVectorIdxConstant(0, DL));
}
// Turn it into a scalar SETCC.
SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
N->getOperand(2));
// Vectors may have different boolean contents from scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
return DAG.getNode(ExtendCode, DL, NVT, Res);
}
//===----------------------------------------------------------------------===//
// Operand Vector Scalarization <1 x ty> -> ty.
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
if (!Res.getNode()) {
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to scalarize this operator's "
"operand!\n");
case ISD::BITCAST:
Res = ScalarizeVecOp_BITCAST(N);
break;
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::TRUNCATE:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
Res = ScalarizeVecOp_UnaryOp(N);
break;
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
Res = ScalarizeVecOp_UnaryOp_StrictFP(N);
break;
case ISD::CONCAT_VECTORS:
Res = ScalarizeVecOp_CONCAT_VECTORS(N);
break;
case ISD::EXTRACT_VECTOR_ELT:
Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
break;
case ISD::VSELECT:
Res = ScalarizeVecOp_VSELECT(N);
break;
case ISD::SETCC:
Res = ScalarizeVecOp_VSETCC(N);
break;
case ISD::STORE:
Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
case ISD::STRICT_FP_ROUND:
Res = ScalarizeVecOp_STRICT_FP_ROUND(N, OpNo);
break;
case ISD::FP_ROUND:
Res = ScalarizeVecOp_FP_ROUND(N, OpNo);
break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
Res = ScalarizeVecOp_VECREDUCE(N);
break;
}
}
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
/// If the value to convert is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::BITCAST, SDLoc(N),
N->getValueType(0), Elt);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the operation on the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
N->getValueType(0).getScalarType(), Elt);
// Revectorize the result so the types line up with what the uses of this
// expression expect.
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the strict FP operation on the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(1));
SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
{ N->getValueType(0).getScalarType(), MVT::Other },
{ N->getOperand(0), Elt });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
// Revectorize the result so the types line up with what the uses of this
// expression expect.
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
// Do our own replacement and return SDValue() to tell the caller that we
// handled all replacements since caller can only handle a single result.
ReplaceValueWith(SDValue(N, 0), Res);
return SDValue();
}
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
Ops[i] = GetScalarizedVector(N->getOperand(i));
return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
/// so just return the element, ignoring the index.
SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue Res = GetScalarizedVector(N->getOperand(0));
if (Res.getValueType() != VT)
Res = VT.isFloatingPoint()
? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
: DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
return Res;
}
/// If the input condition is a vector that needs to be scalarized, it must be
/// <1 x i1>, so just convert to a normal ISD::SELECT
/// (still with vector output type since that was acceptable if we got here).
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
EVT VT = N->getValueType(0);
return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1),
N->getOperand(2));
}
/// If the operand is a vector that needs to be scalarized then the
/// result must be v1i1, so just convert to a scalar SETCC and wrap
/// with a scalar_to_vector since the result type is legal if we got here.
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type");
EVT VT = N->getValueType(0);
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
EVT OpVT = N->getOperand(0).getValueType();
EVT NVT = VT.getVectorElementType();
SDLoc DL(N);
// Turn it into a scalar SETCC.
SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
N->getOperand(2));
// Vectors may have different boolean contents from scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
Res = DAG.getNode(ExtendCode, DL, NVT, Res);
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res);
}
/// If the value to store is a vector that needs to be scalarized, it must be
/// <1 x ty>. Just store the element.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
assert(N->isUnindexed() && "Indexed store of one-element vector?");
assert(OpNo == 1 && "Do not know how to scalarize this operand!");
SDLoc dl(N);
if (N->isTruncatingStore())
return DAG.getTruncStore(
N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getMemoryVT().getVectorElementType(), N->getOriginalAlign(),
N->getMemOperand()->getFlags(), N->getAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getOriginalAlign(), N->getMemOperand()->getFlags(),
N->getAAInfo());
}
/// If the value to round is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
N->getValueType(0).getVectorElementType(), Elt,
N->getOperand(1));
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
}
SDValue DAGTypeLegalizer::ScalarizeVecOp_STRICT_FP_ROUND(SDNode *N,
unsigned OpNo) {
assert(OpNo == 1 && "Wrong operand for scalarization!");
SDValue Elt = GetScalarizedVector(N->getOperand(1));
SDValue Res = DAG.getNode(ISD::STRICT_FP_ROUND, SDLoc(N),
{ N->getValueType(0).getVectorElementType(),
MVT::Other },
{ N->getOperand(0), Elt, N->getOperand(2) });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
// Do our own replacement and return SDValue() to tell the caller that we
// handled all replacements since caller can only handle a single result.
ReplaceValueWith(SDValue(N, 0), Res);
return SDValue();
}
SDValue DAGTypeLegalizer::ScalarizeVecOp_VECREDUCE(SDNode *N) {
SDValue Res = GetScalarizedVector(N->getOperand(0));
// Result type may be wider than element type.
if (Res.getValueType() != N->getValueType(0))
Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Res);
return Res;
}
//===----------------------------------------------------------------------===//
// Result Vector Splitting
//===----------------------------------------------------------------------===//
/// This method is called when the specified result of the specified node is
/// found to need vector splitting. At this point, the node may also have
/// invalid operands or may have other results that need legalization; we just
/// know that (at least) one result needs vector splitting.
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n");
SDValue Lo, Hi;
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(ResNo), true))
return;
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "SplitVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to split the result of this "
"operator!\n");
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::VSELECT:
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break;
case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
case ISD::MLOAD:
SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
break;
case ISD::MGATHER:
SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
break;
case ISD::SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
break;
case ISD::VECTOR_SHUFFLE:
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
case ISD::VAARG:
SplitVecRes_VAARG(N, Lo, Hi);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
break;
case ISD::ABS:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTTZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::FCANONICALIZE:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
SplitVecRes_ExtendOp(N, Lo, Hi);
break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case ISD::SDIV:
case ISD::UDIV:
case ISD::FDIV:
case ISD::FPOW:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::UREM:
case ISD::SREM:
case ISD::FREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT:
SplitVecRes_BinOp(N, Lo, Hi);
break;
case ISD::FMA:
SplitVecRes_TernaryOp(N, Lo, Hi);
break;
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
SplitVecRes_StrictFPOp(N, Lo, Hi);
break;
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
case ISD::UDIVFIXSAT:
SplitVecRes_FIX(N, Lo, Hi);
break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
if (Lo.getNode())
SetSplitVector(SDValue(N, ResNo), Lo, Hi);
}
void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
MachinePointerInfo &MPI,
SDValue &Ptr) {
SDLoc DL(N);
unsigned IncrementSize = MemVT.getSizeInBits().getKnownMinSize() / 8;
if (MemVT.isScalableVector()) {
SDValue BytesIncrement = DAG.getVScale(
DL, Ptr.getValueType(),
APInt(Ptr.getValueSizeInBits().getFixedSize(), IncrementSize));
MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement);
} else {
MPI = N->getPointerInfo().getWithOffset(IncrementSize);
// Increment the pointer to the other half.
Ptr = DAG.getObjectPtrOffset(DL, Ptr, IncrementSize);
}
}
void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
}
void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Op0Lo, Op0Hi;
GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
SDValue Op1Lo, Op1Hi;
GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
SDValue Op2Lo, Op2Hi;
GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
SDLoc dl(N);
Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo,
Op2Lo, N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi,
Op2Hi, N->getFlags());
}
void DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
SDValue Op2 = N->getOperand(2);
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2,
N->getFlags());
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2,
N->getFlags());
}
void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// We know the result is a vector. The input may be either a vector or a
// scalar value.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SDLoc dl(N);
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
// Handle some special cases efficiently.
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeWidenVector:
break;
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
// A scalar to vector conversion, where the scalar needs expansion.
// If the vector is being split in two then we can just convert the
// expanded pieces.
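// For instance (a sketch): bitcasting an i128 that itself needs expansion into
// a v2i64 result split into two v1i64 halves can reuse the two expanded i64
// pieces directly, one bitcast per half.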
if (LoVT == HiVT) {
GetExpandedOp(InOp, Lo, Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
return;
}
break;
case TargetLowering::TypeSplitVector:
// If the input is a vector that needs to be split, convert each split
// piece of the input now.
GetSplitVector(InOp, Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
return;
case TargetLowering::TypeScalarizeScalableVector:
report_fatal_error("Scalarization of scalable vectors is not supported.");
}
// In the general case, convert the input to an integer and split it by hand.
EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
if (DAG.getDataLayout().isBigEndian())
std::swap(LoIntVT, HiIntVT);
SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
}
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
Lo = DAG.getBuildVector(LoVT, dl, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
Hi = DAG.getBuildVector(HiVT, dl, HiOps);
}
void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
SDLoc dl(N);
unsigned NumSubvectors = N->getNumOperands() / 2;
if (NumSubvectors == 1) {
Lo = N->getOperand(0);
Hi = N->getOperand(1);
return;
}
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps);
}
void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Hi = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorNumElements(), dl));
}
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
EVT VecVT = Vec.getValueType();
unsigned VecElems = VecVT.getVectorNumElements();
unsigned SubElems = SubVec.getValueType().getVectorNumElements();
// If we know the index is 0, and we know the subvector doesn't cross the
// boundary between the halves, we can avoid spilling the vector, and insert
// into the lower half of the split vector directly.
// TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
// there is no boundary crossing. But those cases don't seem to get hit in
// practice.
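// For example (hypothetical types): inserting a v2f32 subvector at index 0 of
// a v8f32 that is split into two v4f32 halves touches only the Lo half, so the
// stack round-trip below can be skipped.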
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
return;
}
// Spill the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
SmallestAlign);
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
MachinePointerInfo::getUnknownStack(MF));
// Load the Lo part from the stack slot.
Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
}
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
}
void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc DL(N);
SDValue RHSLo, RHSHi;
SDValue RHS = N->getOperand(1);
EVT RHSVT = RHS.getValueType();
if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector)
GetSplitVector(RHS, RHSLo, RHSHi);
else
std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS));
Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo);
Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi);
}
void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc dl(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) =
DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
DAG.getValueType(LoVT));
Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
DAG.getValueType(HiVT));
}
void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDLoc dl(N);
SDValue InLo, InHi;
if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(N0, InLo, InHi);
else
std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
EVT InLoVT = InLo.getValueType();
unsigned InNumElements = InLoVT.getVectorNumElements();
EVT OutLoVT, OutHiVT;
std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned OutNumElements = OutLoVT.getVectorNumElements();
assert((2 * OutNumElements) <= InNumElements &&
"Illegal extend vector in reg split");
// *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
// input vector (i.e. we only use InLo):
// OutLo will extend the first OutNumElements from InLo.
// OutHi will extend the next OutNumElements from InLo.
// Shuffle the elements from InLo for OutHi into the bottom elements to
// create a 'fake' InHi.
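// As a concrete sketch (hypothetical types): for a v4i64 result split into two
// v2i64 halves with a v8i32 input whose low half InLo is v4i32, OutLo extends
// InLo elements 0..1, and SplitHi becomes {2, 3, -1, -1}, moving InLo elements
// 2..3 into lanes 0..1 to form the 'fake' InHi used for OutHi.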
SmallVector<int, 8> SplitHi(InNumElements, -1);
for (unsigned i = 0; i != OutNumElements; ++i)
SplitHi[i] = i + OutNumElements;
InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
}
void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
unsigned NumOps = N->getNumOperands();
SDValue Chain = N->getOperand(0);
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 4> OpsLo(NumOps);
SmallVector<SDValue, 4> OpsHi(NumOps);
// The Chain is the first operand.
OpsLo[0] = Chain;
OpsHi[0] = Chain;
// Now process the remaining operands.
for (unsigned i = 1; i < NumOps; ++i) {
SDValue Op = N->getOperand(i);
SDValue OpLo = Op;
SDValue OpHi = Op;
EVT InVT = Op.getValueType();
if (InVT.isVector()) {
// If the input also splits, handle it directly for a
// compile time speedup. Otherwise split it by hand.
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(Op, OpLo, OpHi);
else
std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i);
}
OpsLo[i] = OpLo;
OpsHi[i] = OpHi;
}
EVT LoValueVTs[] = {LoVT, MVT::Other};
EVT HiValueVTs[] = {HiVT, MVT::Other};
Lo = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(LoValueVTs), OpsLo,
N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(HiValueVTs), OpsHi,
N->getFlags());
// Build a factor node to remember that this Op is independent of the
// other one.
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Chain);
}
SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
SmallVector<SDValue, 8> Scalars;
SmallVector<SDValue, 4> Operands(N->getNumOperands());
// If ResNE is 0, fully unroll the vector op.
if (ResNE == 0)
ResNE = NE;
else if (NE > ResNE)
NE = ResNE;
//The results of each unrolled operation, including the chain.
EVT ChainVTs[] = {EltVT, MVT::Other};
SmallVector<SDValue, 8> Chains;
unsigned i;
for (i = 0; i != NE; ++i) {
Operands[0] = Chain;
for (unsigned j = 1, e = N->getNumOperands(); j != e; ++j) {
SDValue Operand = N->getOperand(j);
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
EVT OperandEltVT = OperandVT.getVectorElementType();
Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
Operand, DAG.getVectorIdxConstant(i, dl));
} else {
Operands[j] = Operand;
}
}
SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands);
Scalar.getNode()->setFlags(N->getFlags());
//Add in the scalar as well as its chain value to the
//result vectors.
Scalars.push_back(Scalar);
Chains.push_back(Scalar.getValue(1));
}
for (; i < ResNE; ++i)
Scalars.push_back(DAG.getUNDEF(EltVT));
// Build a new factor node to connect the chain back together.
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), Chain);
// Create a new BUILD_VECTOR node
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, ResNE);
return DAG.getBuildVector(VecVT, dl, Scalars);
}
void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
SDValue LoLHS, HiLHS, LoRHS, HiRHS;
if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
} else {
std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
}
unsigned Opcode = N->getOpcode();
SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
LoNode->setFlags(N->getFlags());
HiNode->setFlags(N->getFlags());
Lo = SDValue(LoNode, ResNo);
Hi = SDValue(HiNode, ResNo);
// Replace the other vector result not being explicitly split here.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
SetSplitVector(SDValue(N, OtherNo),
SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
} else {
SDValue OtherVal = DAG.getNode(
ISD::CONCAT_VECTORS, dl, OtherVT,
SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
}
}
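// Split the result of INSERT_VECTOR_ELT. With a constant index the element is
// inserted directly into the correct half; otherwise the vector is spilled to
// a stack slot, the element is stored into it, and the two halves are
// reloaded.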
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Elt = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = CIdx->getZExtValue();
unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
if (IdxVal < LoNumElts)
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx);
else
Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
return;
}
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
return;
// Make the vector elements byte-addressable if they aren't already.
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements());
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Extend the element type to match if needed.
if (EltVT.bitsGT(Elt.getValueType()))
Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
}
// Spill the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
SmallestAlign);
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getTruncStore(
Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
// Load the Lo part from the stack slot.
Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8;
StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
if (LoVT != Lo.getValueType())
Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
if (HiVT != Hi.getValueType())
Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
}
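// SCALAR_TO_VECTOR only defines element zero, so the low half carries the
// scalar and the high half is simply undef.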
void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
Hi = DAG.getUNDEF(HiVT);
}
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
EVT LoVT, HiVT;
SDLoc dl(LD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
ISD::LoadExtType ExtType = LD->getExtensionType();
SDValue Ch = LD->getChain();
SDValue Ptr = LD->getBasePtr();
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT MemoryVT = LD->getMemoryVT();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) {
SDValue Value, NewChain;
std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
std::tie(Lo, Hi) = DAG.SplitVector(Value, dl);
ReplaceValueWith(SDValue(LD, 1), NewChain);
return;
}
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo(), LoMemVT, LD->getOriginalAlign(),
MMOFlags, AAInfo);
MachinePointerInfo MPI;
IncrementPointer(LD, LoMemVT, MPI, Ptr);
Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, MPI,
HiMemVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(LD, 1), Ch);
}
void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue &Lo, SDValue &Hi) {
assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
EVT LoVT, HiVT;
SDLoc dl(MLD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
SDValue Ch = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
SDValue Offset = MLD->getOffset();
assert(Offset.isUndef() && "Unexpected indexed masked load offset");
SDValue Mask = MLD->getMask();
SDValue PassThru = MLD->getPassThru();
Align Alignment = MLD->getOriginalAlign();
ISD::LoadExtType ExtType = MLD->getExtensionType();
// Split Mask operand
SDValue MaskLo, MaskHi;
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
bool HiIsEmpty = false;
std::tie(LoMemVT, HiMemVT) =
DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
else
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
MMO, MLD->getAddressingMode(), ExtType,
MLD->isExpandingLoad());
if (HiIsEmpty) {
// The hi masked load has zero storage size. We therefore simply set it to
// the low masked load and rely on subsequent removal from the chain.
Hi = Lo;
} else {
// Generate hi masked load.
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
MLD->isExpandingLoad());
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
MLD->getPointerInfo().getWithOffset(HiOffset),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
MLD->isExpandingLoad());
}
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MLD, 1), Ch);
}
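// Split the result of a masked gather. Only the mask, pass-through and index
// vectors are split; both halves reuse the same base pointer and scale.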
void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
Align Alignment = MGT->getOriginalAlign();
// Split Mask operand
SDValue MaskLo, MaskHi;
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
else
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
}
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
EVT LoVT, HiVT;
SDLoc DL(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly. Otherwise split it by hand.
SDValue LL, LH, RL, RH;
if (getTypeAction(N->getOperand(0).getValueType()) ==
TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(0), LL, LH);
else
std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
if (getTypeAction(N->getOperand(1).getValueType()) ==
TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(1), RL, RH);
else
std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
}
void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// Get the dest types - they may not match the input types, e.g. int_to_fp.
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly for a compile time speedup.
// Otherwise split it by hand.
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
EVT InVT = N->getOperand(OpNo).getValueType();
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(OpNo), Lo, Hi);
else
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, OpNo);
if (N->getOpcode() == ISD::FP_ROUND) {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1),
N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1),
N->getFlags());
} else {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getFlags());
}
}
void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
EVT SrcVT = N->getOperand(0).getValueType();
EVT DestVT = N->getValueType(0);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
// We can do better than a generic split operation if the extend is doing
// more than just doubling the width of the elements and the following are
// true:
// - The number of vector elements is even,
// - the source type is legal,
// - the type of a split source is illegal,
// - the type of an extended (by doubling element size) source is legal, and
// - the type of that extended source when split is legal.
//
// This won't necessarily completely legalize the operation, but it will
// more effectively move in the right direction and prevent falling down
// to scalarization in many cases due to the input vector being split too
// far.
if ((SrcVT.getVectorMinNumElements() & 1) == 0 &&
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
LLVMContext &Ctx = *DAG.getContext();
EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx);
EVT SplitLoVT, SplitHiVT;
std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:";
N->dump(&DAG); dbgs() << "\n");
// Extend the source vector by one step.
SDValue NewSrc =
DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
// Get the low and high halves of the new, extended one step, vector.
std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
// Extend those vector halves the rest of the way.
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
return;
}
}
// Fall back to the generic unary operator splitting otherwise.
SplitVecRes_UnaryOp(N, Lo, Hi);
}
void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
SDValue &Lo, SDValue &Hi) {
// The low and high parts of the original input give four input vectors.
SDValue Inputs[4];
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
EVT NewVT = Inputs[0].getValueType();
unsigned NewElts = NewVT.getVectorNumElements();
// If Lo or Hi uses elements from at most two of the four input vectors, then
// express it as a vector shuffle of those two inputs. Otherwise extract the
// input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
SmallVector<int, 16> Ops;
for (unsigned High = 0; High < 2; ++High) {
SDValue &Output = High ? Hi : Lo;
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
// out with useBuildVector set.
unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
unsigned FirstMaskIdx = High * NewElts;
bool useBuildVector = false;
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element does not index into any input vector.
Ops.push_back(-1);
continue;
}
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Find or create a shuffle vector operand to hold this input.
unsigned OpNo;
for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
if (InputUsed[OpNo] == Input) {
// This input vector is already an operand.
break;
} else if (InputUsed[OpNo] == -1U) {
// Create a new operand for this input vector.
InputUsed[OpNo] = Input;
break;
}
}
if (OpNo >= array_lengthof(InputUsed)) {
// More than two input vectors used! Give up on trying to create a
// shuffle vector. Insert all elements into a BUILD_VECTOR instead.
useBuildVector = true;
break;
}
// Add the mask index for the new shuffle vector.
Ops.push_back(Idx + OpNo * NewElts);
}
if (useBuildVector) {
EVT EltVT = NewVT.getVectorElementType();
SmallVector<SDValue, 16> SVOps;
// Extract the input elements by hand.
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element is "undef" or indexes off the end of the input.
SVOps.push_back(DAG.getUNDEF(EltVT));
continue;
}
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Extract the vector element by hand.
SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
Inputs[Input],
DAG.getVectorIdxConstant(Idx, dl)));
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
Output = DAG.getBuildVector(NewVT, dl, SVOps);
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = DAG.getUNDEF(NewVT);
} else {
SDValue Op0 = Inputs[InputUsed[0]];
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = InputUsed[1] == -1U ?
DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
// At least one input vector was used. Create a new shuffle vector.
Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
}
Ops.clear();
}
}
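// A VAARG of an illegal vector type is lowered as two VAARGs of the
// half-width type, chained one after the other, and the combined chain
// replaces the original one.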
void DAGTypeLegalizer::SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT OVT = N->getValueType(0);
EVT NVT = OVT.getHalfNumVectorElementsVT(*DAG.getContext());
SDValue Chain = N->getOperand(0);
SDValue Ptr = N->getOperand(1);
SDValue SV = N->getOperand(2);
SDLoc dl(N);
const Align Alignment =
DAG.getDataLayout().getABITypeAlign(NVT.getTypeForEVT(*DAG.getContext()));
Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, SV, Alignment.value());
Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, SV, Alignment.value());
Chain = Hi.getValue(1);
// Modified the chain - switch anything that used the old chain to use
// the new one.
ReplaceValueWith(SDValue(N, 1), Chain);
}
//===----------------------------------------------------------------------===//
// Operand Vector Splitting
//===----------------------------------------------------------------------===//
/// This method is called when the specified operand of the specified node is
/// found to need vector splitting. At this point, all of the result types of
/// the node are known to be legal, but other operands of the node may need
/// legalization as well as the specified one.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom split this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
if (!Res.getNode()) {
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to split this operator's "
"operand!\n");
case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
case ISD::TRUNCATE:
Res = SplitVecOp_TruncateHelper(N);
break;
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break;
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
case ISD::MSTORE:
Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
break;
case ISD::MSCATTER:
Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
break;
case ISD::MGATHER:
Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
break;
case ISD::VSELECT:
Res = SplitVecOp_VSELECT(N, OpNo);
break;
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
if (N->getValueType(0).bitsLT(
N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType()))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
case ISD::CTTZ:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
case ISD::FCANONICALIZE:
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
Res = SplitVecOp_VECREDUCE(N, OpNo);
break;
}
}
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
if (N->isStrictFPOpcode())
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
"Invalid operand expansion");
else
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
// The only possibility for an illegal operand is the mask, since otherwise
// result type legalization would already have handled this node.
assert(OpNo == 0 && "Illegal operand must be mask");
SDValue Mask = N->getOperand(0);
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
EVT Src0VT = Src0.getValueType();
SDLoc DL(N);
assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
assert(Lo.getValueType() == Hi.getValueType() &&
"Lo and Hi have differing types");
EVT LoOpVT, HiOpVT;
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
SDValue LoSelect =
DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
SDValue HiSelect =
DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
SDValue VecOp = N->getOperand(OpNo);
EVT VecVT = VecOp.getValueType();
assert(VecVT.isVector() && "Can only split reduce vector operand");
GetSplitVector(VecOp, Lo, Hi);
EVT LoOpVT, HiOpVT;
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
bool NoNaN = N->getFlags().hasNoNaNs();
unsigned CombineOpc = 0;
switch (N->getOpcode()) {
case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
case ISD::VECREDUCE_FMAX:
CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM;
break;
case ISD::VECREDUCE_FMIN:
CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM;
break;
default:
llvm_unreachable("Unexpected reduce ISD node");
}
// Use the appropriate scalar instruction on the split subvectors before
// reducing the now partially reduced smaller vector.
SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi, N->getFlags());
return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, N->getFlags());
}
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
EVT InVT = Lo.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
- InVT.getVectorNumElements());
+ InVT.getVectorElementCount());
if (N->isStrictFPOpcode()) {
Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
{ N->getOperand(0), Lo });
Hi = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
{ N->getOperand(0), Hi });
// Build a factor node to remember that this operation is independent
// of the other one.
SDValue Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Ch);
} else {
Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
// For example, i64 = BITCAST v4i16 on Alpha. Typically the vector will
// end up being split all the way down to individual components. Convert the
// split pieces into integers and reassemble.
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = BitConvertToInteger(Lo);
Hi = BitConvertToInteger(Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
JoinIntegers(Lo, Hi));
}
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
// We know that the extracted result type is legal.
EVT SubVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
SDValue Lo, Hi;
+
+ if (SubVT.isScalableVector() !=
+ N->getOperand(0).getValueType().isScalableVector())
+ report_fatal_error("Extracting a fixed-length vector from an illegal "
+ "scalable vector is not yet supported");
+
GetSplitVector(N->getOperand(0), Lo, Hi);
- uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal < LoElts) {
- assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+ assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts &&
"Extracted subvector crosses vector split!");
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
} else {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
DAG.getVectorIdxConstant(IdxVal - LoElts, dl));
}
}
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VecVT = Vec.getValueType();
if (isa<ConstantSDNode>(Idx)) {
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue Lo, Hi;
GetSplitVector(Vec, Lo, Hi);
uint64_t LoElts = Lo.getValueType().getVectorNumElements();
if (IdxVal < LoElts)
return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
return SDValue(DAG.UpdateNodeOperands(N, Hi,
DAG.getConstant(IdxVal - LoElts, SDLoc(N),
Idx.getValueType())), 0);
}
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
return SDValue();
// Make the vector elements byte-addressable if they aren't already.
SDLoc dl(N);
EVT EltVT = VecVT.getVectorElementType();
if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements());
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
}
// Store the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
SmallestAlign);
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
// FIXME: This is to handle i1 vectors with elements promoted to i8.
// i1 vector handling needs general improvement.
if (N->getValueType(0).bitsLT(EltVT)) {
SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0));
}
return DAG.getExtLoad(
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
}
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
SDValue Lo, Hi;
// *_EXTEND_VECTOR_INREG only references the lower half of the input, so
// splitting the result has the same effect as splitting the input operand.
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
unsigned OpNo) {
EVT LoVT, HiVT;
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();
Align Alignment = MGT->getOriginalAlign();
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
else
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
OpsLo, MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
OpsHi, MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
Hi);
ReplaceValueWith(SDValue(MGT, 0), Res);
return SDValue();
}
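// Split the stored value (and the mask) of a masked store. If the high half
// has zero storage size only the low store is emitted; otherwise the two
// stores are tied together with a TokenFactor.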
SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
assert(N->isUnindexed() && "Indexed masked store of vector?");
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Offset = N->getOffset();
assert(Offset.isUndef() && "Unexpected indexed masked store offset");
SDValue Mask = N->getMask();
SDValue Data = N->getValue();
Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
}
EVT MemoryVT = N->getMemoryVT();
EVT LoMemVT, HiMemVT;
bool HiIsEmpty = false;
std::tie(LoMemVT, HiMemVT) =
DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
SDValue Lo, Hi, Res;
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
N->getAddressingMode(), N->isTruncatingStore(),
N->isCompressingStore());
if (HiIsEmpty) {
// The hi masked store has zero storage size.
// Only the lo masked store is needed.
Res = Lo;
} else {
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
N->isCompressingStore());
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore,
HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges());
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
N->getAddressingMode(), N->isTruncatingStore(),
N->isCompressingStore());
// Build a factor node to remember that this store is independent of the
// other one.
Res = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
return Res;
}
SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Index = N->getIndex();
SDValue Scale = N->getScale();
SDValue Data = N->getValue();
Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
// Split all operands
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
}
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
SDValue Lo;
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo(), MachineMemOperand::MOStore,
MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO, N->getIndexType());
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO, N->getIndexType());
}
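// Split the stored value of a regular store into low and high stores at
// consecutive addresses, joined by a TokenFactor. If the split halves are not
// byte-sized, fall back to scalarizing the store.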
SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
assert(N->isUnindexed() && "Indexed store of vector?");
assert(OpNo == 1 && "Can only split the stored value");
SDLoc DL(N);
bool isTruncating = N->isTruncatingStore();
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
EVT MemoryVT = N->getMemoryVT();
Align Alignment = N->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDValue Lo, Hi;
GetSplitVector(N->getOperand(1), Lo, Hi);
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
// Scalarize if the split halves are not byte-sized.
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized())
return TLI.scalarizeVectorStore(N, DAG);
if (isTruncating)
Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
Alignment, MMOFlags, AAInfo);
else
Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
AAInfo);
MachinePointerInfo MPI;
IncrementPointer(N, LoMemVT, MPI, Ptr);
if (isTruncating)
Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, MPI,
HiMemVT, Alignment, MMOFlags, AAInfo);
else
Hi = DAG.getStore(Ch, DL, Hi, Ptr, MPI, Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
SDLoc DL(N);
// The input operands all must have the same type, and we know the result
// type is valid. Convert this to a buildvector which extracts all the
// input elements.
// TODO: If the input elements are power-two vectors, we could convert this to
// a new CONCAT_VECTORS node with elements that are half-wide.
SmallVector<SDValue, 32> Elts;
EVT EltVT = N->getValueType(0).getVectorElementType();
for (const SDValue &Op : N->op_values()) {
for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
i != e; ++i) {
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
DAG.getVectorIdxConstant(i, DL)));
}
}
return DAG.getBuildVector(N->getValueType(0), DL, Elts);
}
SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
// The result type is legal, but the input type is illegal. If splitting
// ends up with the result type of each half still being legal, just
// do that. If, however, that would result in an illegal result type,
// we can try to get more clever with power-two vectors. Specifically,
// split the input type, but also widen the result element size, then
// concatenate the halves and truncate again. For example, consider a target
// where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
// vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
// %inlo = v4i32 extract_subvector %in, 0
// %inhi = v4i32 extract_subvector %in, 4
// %lo16 = v4i16 trunc v4i32 %inlo
// %hi16 = v4i16 trunc v4i32 %inhi
// %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
// %res = v8i8 trunc v8i16 %in16
//
// Without this transform, the original truncate would end up being
// scalarized, which is pretty much always a last resort.
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
SDValue InVec = N->getOperand(OpNo);
EVT InVT = InVec->getValueType(0);
EVT OutVT = N->getValueType(0);
- unsigned NumElements = OutVT.getVectorNumElements();
+ ElementCount NumElements = OutVT.getVectorElementCount();
bool IsFloat = OutVT.isFloatingPoint();
- // Widening should have already made sure this is a power-two vector
- // if we're trying to split it at all. assert() that's true, just in case.
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
-
unsigned InElementSize = InVT.getScalarSizeInBits();
unsigned OutElementSize = OutVT.getScalarSizeInBits();
// Determine the split output VT. If it's legal, we can just split directly.
EVT LoOutVT, HiOutVT;
std::tie(LoOutVT, HiOutVT) = DAG.GetSplitDestVTs(OutVT);
assert(LoOutVT == HiOutVT && "Unequal split?");
// If the input elements are only 1/2 the width of the result elements,
// just use the normal splitting. Our trick only works if there's room
// to split more than once.
if (isTypeLegal(LoOutVT) ||
InElementSize <= OutElementSize * 2)
return SplitVecOp_UnaryOp(N);
SDLoc DL(N);
// Don't touch if this will be scalarized.
EVT FinalVT = InVT;
while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
FinalVT = FinalVT.getHalfNumVectorElementsVT(*DAG.getContext());
if (getTypeAction(FinalVT) == TargetLowering::TypeScalarizeVector)
return SplitVecOp_UnaryOp(N);
// Get the split input vector.
SDValue InLoVec, InHiVec;
GetSplitVector(InVec, InLoVec, InHiVec);
// Truncate them to 1/2 the element size.
+ //
+ // This assumes the number of elements is a power of two; any vector that
+ // isn't should be widened, not split.
EVT HalfElementVT = IsFloat ?
EVT::getFloatingPointVT(InElementSize/2) :
EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
NumElements/2);
SDValue HalfLo;
SDValue HalfHi;
SDValue Chain;
if (N->isStrictFPOpcode()) {
HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
{N->getOperand(0), InLoVec});
HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
{N->getOperand(0), InHiVec});
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1),
HalfHi.getValue(1));
} else {
HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec);
HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec);
}
// Concatenate them to get the full intermediate truncation result.
EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
HalfHi);
// Now finish up by truncating all the way down to the original result
// type. This should normally be something that ends up being legal directly,
// but in theory if a target has very wide vectors and an annoyingly
// restricted set of legal types, this split can chain to build things up.
if (N->isStrictFPOpcode()) {
SDValue Res = DAG.getNode(
ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other},
{Chain, InterVec,
DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))});
// Relink the chain
ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1));
return Res;
}
return IsFloat
? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
DAG.getTargetConstant(
0, DL, TLI.getPointerTy(DAG.getDataLayout())))
: DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
}
SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
// The result has a legal vector type, but the input needs splitting.
SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
SDLoc DL(N);
GetSplitVector(N->getOperand(0), Lo0, Hi0);
GetSplitVector(N->getOperand(1), Lo1, Hi1);
auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
LLVMContext &Context = *DAG.getContext();
EVT PartResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt);
EVT WideResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt*2);
LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
EVT OpVT = N->getOperand(0).getValueType();
ISD::NodeType ExtendCode =
TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
return DAG.getNode(ExtendCode, DL, N->getValueType(0), Con);
}
SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc DL(N);
GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
EVT InVT = Lo.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
InVT.getVectorNumElements());
if (N->isStrictFPOpcode()) {
Lo = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
{ N->getOperand(0), Lo, N->getOperand(2) });
Hi = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
{ N->getOperand(0), Hi, N->getOperand(2) });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
ReplaceValueWith(SDValue(N, 1), NewChain);
} else {
Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) {
// The result (and the first input) has a legal vector type, but the second
// input needs splitting.
return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements());
}
//===----------------------------------------------------------------------===//
// Result Vector Widening
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG);
dbgs() << "\n");
// See if the target wants to custom widen this node.
if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
return;
SDValue Res = SDValue();
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "WidenVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to widen the result of this operator!");
case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::VSELECT:
case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE:
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
break;
case ISD::MLOAD:
Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
break;
case ISD::MGATHER:
Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
case ISD::ADD:
case ISD::AND:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::OR:
case ISD::SUB:
case ISD::XOR:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT:
Res = WidenVecRes_Binary(N);
break;
case ISD::FADD:
case ISD::FMUL:
case ISD::FPOW:
case ISD::FSUB:
case ISD::FDIV:
case ISD::FREM:
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
Res = WidenVecRes_BinaryCanTrap(N);
break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
// These are binary operations, but with an extra operand that shouldn't
// be widened (the scale).
Res = WidenVecRes_BinaryWithExtraScalarOp(N);
break;
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
Res = WidenVecRes_StrictFP(N);
break;
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
Res = WidenVecRes_OverflowOp(N, ResNo);
break;
case ISD::FCOPYSIGN:
Res = WidenVecRes_FCOPYSIGN(N);
break;
case ISD::FPOWI:
Res = WidenVecRes_POWI(N);
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
Res = WidenVecRes_Shift(N);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
break;
case ISD::ANY_EXTEND:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
Res = WidenVecRes_Convert(N);
break;
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC: {
// We're going to widen this vector op to a legal type by padding with undef
// elements. If the wide vector op is eventually going to be expanded to
// scalar libcalls, then unroll into scalar ops now to avoid unnecessary
// libcalls on the undef elements.
EVT VT = N->getValueType(0);
EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) &&
TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) {
Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements());
break;
}
}
// If the target has custom/legal support for the scalar FP intrinsic ops
// (they are probably not destined to become libcalls), then widen those like
// any other unary ops.
LLVM_FALLTHROUGH;
case ISD::ABS:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FNEG:
case ISD::FREEZE:
case ISD::FCANONICALIZE:
Res = WidenVecRes_Unary(N);
break;
case ISD::FMA:
Res = WidenVecRes_Ternary(N);
break;
}
// If Res is null, the sub-method took care of registering the result.
if (Res.getNode())
SetWidenedVector(SDValue(N, ResNo), Res);
}
SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
// Ternary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
SDValue InOp3 = GetWidenedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
}
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
}
SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) {
// Binary op widening, but with an extra operand that shouldn't be widened.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
SDValue InOp3 = N->getOperand(2);
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3,
N->getFlags());
}
// Given a vector of operations that have been broken up to widen, see
// if we can collect them together into the next widest legal VT. This
// implementation is trap-safe.
static SDValue CollectOpsToWiden(SelectionDAG &DAG, const TargetLowering &TLI,
SmallVectorImpl<SDValue> &ConcatOps,
unsigned ConcatEnd, EVT VT, EVT MaxVT,
EVT WidenVT) {
// Check to see if we have a single operation with the widen type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
}
SDLoc dl(ConcatOps[0]);
EVT WidenEltVT = WidenVT.getVectorElementType();
// while (Some element of ConcatOps is not of type MaxVT) {
// From the end of ConcatOps, collect elements of the same type and put
// them into an op of the next larger supported type
// }
while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
int Idx = ConcatEnd - 1;
VT = ConcatOps[Idx--].getValueType();
while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
Idx--;
int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
EVT NextVT;
do {
NextSize *= 2;
NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
} while (!TLI.isTypeLegal(NextVT));
if (!VT.isVector()) {
// Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT
SDValue VecOp = DAG.getUNDEF(NextVT);
unsigned NumToInsert = ConcatEnd - Idx - 1;
for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
ConcatOps[OpIdx], DAG.getVectorIdxConstant(i, dl));
}
ConcatOps[Idx+1] = VecOp;
ConcatEnd = Idx + 2;
} else {
// Vector type, create a CONCAT_VECTORS of type NextVT
SDValue undefVec = DAG.getUNDEF(VT);
unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
unsigned RealVals = ConcatEnd - Idx - 1;
unsigned SubConcatEnd = 0;
unsigned SubConcatIdx = Idx + 1;
while (SubConcatEnd < RealVals)
SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
while (SubConcatEnd < OpsToConcat)
SubConcatOps[SubConcatEnd++] = undefVec;
ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
NextVT, SubConcatOps);
ConcatEnd = SubConcatIdx + 1;
}
}
// Check to see if we have a single operation with the widen type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
}
// Add undefs of size MaxVT until ConcatOps grows to the length of WidenVT.
unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
if (NumOps != ConcatEnd) {
SDValue UndefVal = DAG.getUNDEF(MaxVT);
for (unsigned j = ConcatEnd; j < NumOps; ++j)
ConcatOps[j] = UndefVal;
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(ConcatOps.data(), NumOps));
}
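// Widen a binary operation that can trap (e.g. integer division). Executing
// the operation on the undef padding lanes that a plain widen would introduce
// is not safe, so the original elements are processed in the largest legal
// chunks and the pieces are reassembled with CollectOpsToWiden.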
SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
// Binary op widening for operations that can trap.
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
}
if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
// Operation doesn't trap so just widen as normal.
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags);
}
// No legal vector version so unroll the vector operation and then widen.
if (NumElts == 1)
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
// Since the operation can trap, apply operation on the original vector.
EVT MaxVT = VT;
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
SmallVector<SDValue, 16> ConcatOps(CurNumElts);
unsigned ConcatEnd = 0; // Current ConcatOps index.
int Idx = 0; // Current Idx into input vectors.
// NumElts := greatest legal vector size (at most WidenVT)
// while (orig. vector has unhandled elements) {
// take munches of size NumElts from the beginning and add to ConcatOps
// NumElts := next smaller supported vector size or 1
// }
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
DAG.getVectorIdxConstant(Idx, dl));
SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
Idx += NumElts;
CurNumElts -= NumElts;
}
do {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
} while (!TLI.isTypeLegal(VT) && NumElts != 1);
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
InOp1, DAG.getVectorIdxConstant(Idx, dl));
SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
InOp2, DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
EOp1, EOp2, Flags);
}
CurNumElts = 0;
}
}
return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
switch (N->getOpcode()) {
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
return WidenVecRes_STRICT_FSETCC(N);
case ISD::STRICT_FP_EXTEND:
case ISD::STRICT_FP_ROUND:
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return WidenVecRes_Convert_StrictFP(N);
default:
break;
}
// StrictFP op widening for operations that can trap.
unsigned NumOpers = N->getNumOperands();
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
}
// No legal vector version so unroll the vector operation and then widen.
if (NumElts == 1)
return UnrollVectorOp_StrictFP(N, WidenVT.getVectorNumElements());
// Since the operation can trap, apply the operation on the original vector.
EVT MaxVT = VT;
SmallVector<SDValue, 4> InOps;
unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
SmallVector<SDValue, 16> ConcatOps(CurNumElts);
SmallVector<SDValue, 16> Chains;
unsigned ConcatEnd = 0; // Current ConcatOps index.
int Idx = 0; // Current Idx into input vectors.
// The Chain is the first operand.
InOps.push_back(N->getOperand(0));
// Now process the remaining operands.
for (unsigned i = 1; i < NumOpers; ++i) {
SDValue Oper = N->getOperand(i);
if (Oper.getValueType().isVector()) {
assert(Oper.getValueType() == N->getValueType(0) &&
"Invalid operand type to widen!");
Oper = GetWidenedVector(Oper);
}
InOps.push_back(Oper);
}
// NumElts := greatest legal vector size (at most WidenVT)
// while (orig. vector has unhandled elements) {
// take munches of size NumElts from the beginning and add to ConcatOps
// NumElts := next smaller supported vector size or 1
// }
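// The chunking below mirrors WidenVecRes_BinaryCanTrap, except that every
// chunked strict node also produces a chain result; those chains are
// collected in Chains and merged into a single TokenFactor afterwards.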
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
SmallVector<SDValue, 4> EOps;
for (unsigned i = 0; i < NumOpers; ++i) {
SDValue Op = InOps[i];
if (Op.getValueType().isVector())
Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
DAG.getVectorIdxConstant(Idx, dl));
EOps.push_back(Op);
}
EVT OperVT[] = {VT, MVT::Other};
SDValue Oper = DAG.getNode(Opcode, dl, OperVT, EOps);
ConcatOps[ConcatEnd++] = Oper;
Chains.push_back(Oper.getValue(1));
Idx += NumElts;
CurNumElts -= NumElts;
}
do {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
} while (!TLI.isTypeLegal(VT) && NumElts != 1);
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
SmallVector<SDValue, 4> EOps;
for (unsigned i = 0; i < NumOpers; ++i) {
SDValue Op = InOps[i];
if (Op.getValueType().isVector())
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op,
DAG.getVectorIdxConstant(Idx, dl));
EOps.push_back(Op);
}
EVT WidenVT[] = {WidenEltVT, MVT::Other};
SDValue Oper = DAG.getNode(Opcode, dl, WidenVT, EOps);
ConcatOps[ConcatEnd++] = Oper;
Chains.push_back(Oper.getValue(1));
}
CurNumElts = 0;
}
}
// Build a factor node to remember all the Ops that have been created.
SDValue NewChain;
if (Chains.size() == 1)
NewChain = Chains[0];
else
NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
SDLoc DL(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
EVT WideResVT, WideOvVT;
SDValue WideLHS, WideRHS;
// TODO: This might result in a widen/split loop.
if (ResNo == 0) {
WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
WideOvVT = EVT::getVectorVT(
*DAG.getContext(), OvVT.getVectorElementType(),
WideResVT.getVectorNumElements());
WideLHS = GetWidenedVector(N->getOperand(0));
WideRHS = GetWidenedVector(N->getOperand(1));
} else {
WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
WideResVT = EVT::getVectorVT(
*DAG.getContext(), ResVT.getVectorElementType(),
WideOvVT.getVectorNumElements());
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
WideLHS = DAG.getNode(
ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
N->getOperand(0), Zero);
WideRHS = DAG.getNode(
ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
N->getOperand(1), Zero);
}
SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
SDNode *WideNode = DAG.getNode(
N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
// Replace the other vector result not being explicitly widened here.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
} else {
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
SDValue OtherVal = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
}
return SDValue(WideNode, ResNo);
}
SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
unsigned Opcode = N->getOpcode();
unsigned InVTNumElts = InVT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(N->getOperand(0));
InVT = InOp.getValueType();
InVTNumElts = InVT.getVectorNumElements();
if (InVTNumElts == WidenNumElts) {
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
}
if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
// If both input and result vector types are of same width, extend
// operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
// accepts fewer elements in the result than in the input.
if (Opcode == ISD::ANY_EXTEND)
return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
if (Opcode == ISD::SIGN_EXTEND)
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
if (Opcode == ISD::ZERO_EXTEND)
return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
}
}
if (TLI.isTypeLegal(InWidenVT)) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
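// Illustrative example (assumed types): converting v2i32 -> v2f64 when the
// result widens to v4f64: if v4i32 is legal, InOp is concatenated with one
// undef v2i32 to form a v4i32, and a single v4i32 -> v4f64 convert is
// emitted instead of scalarizing.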
if (WidenNumElts % InVTNumElts == 0) {
// Widen the input and call convert on the widened input vector.
unsigned NumConcat = WidenNumElts/InVTNumElts;
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = InOp;
SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVec);
return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
}
if (InVTNumElts % WidenNumElts == 0) {
SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
DAG.getVectorIdxConstant(0, DL));
// Extract the input and convert the shortened input vector.
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVal);
return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
}
}
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar ops than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
DAG.getVectorIdxConstant(i, DL));
if (N->getNumOperands() == 1)
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
else
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
}
return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
SDValue InOp = N->getOperand(1);
SDLoc DL(N);
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
unsigned Opcode = N->getOpcode();
// FIXME: Optimizations need to be implemented here.
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
std::array<EVT, 2> EltVTs = {{EltVT, MVT::Other}};
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 32> OpChains;
// Use the original element count so we don't do more scalar ops than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
DAG.getVectorIdxConstant(i, DL));
Ops[i] = DAG.getNode(Opcode, DL, EltVTs, NewOps);
OpChains.push_back(Ops[i].getValue(1));
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OpChains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenSVT = WidenVT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InSVT = InVT.getVectorElementType();
unsigned InVTNumElts = InVT.getVectorNumElements();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
switch (Opcode) {
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
return DAG.getNode(Opcode, DL, WidenVT, InOp);
}
}
}
// Unroll, extend the scalars and rebuild the vector.
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
DAG.getVectorIdxConstant(i, DL));
switch (Opcode) {
case ISD::ANY_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
break;
case ISD::SIGN_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
break;
case ISD::ZERO_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
break;
default:
llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
}
Ops.push_back(Val);
}
while (Ops.size() != WidenNumElts)
Ops.push_back(DAG.getUNDEF(WidenSVT));
return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
// If this is an FCOPYSIGN with the same input types, we can treat it as a
// normal (can trap) binary op.
if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType())
return WidenVecRes_BinaryCanTrap(N);
// If the types are different, fall back to unrolling.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
}
SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
SDValue ShOp = N->getOperand(1);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
SDValue ShOp = N->getOperand(1);
EVT ShVT = ShOp.getValueType();
if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
ShOp = GetWidenedVector(ShOp);
ShVT = ShOp.getValueType();
}
EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
ShVT.getVectorElementType(),
WidenVT.getVectorNumElements());
if (ShVT != ShWidenVT)
ShOp = ModifyToType(ShOp, ShWidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
// Unary op widening.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
cast<VTSDNode>(N->getOperand(1))->getVT()
.getVectorElementType(),
WidenVT.getVectorNumElements());
SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, WidenLHS, DAG.getValueType(ExtVT));
}
SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
return GetWidenedVector(WidenVec);
}
SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDLoc dl(N);
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
break;
case TargetLowering::TypeScalarizeScalableVector:
report_fatal_error("Scalarization of scalable vectors is not supported.");
case TargetLowering::TypePromoteInteger: {
// If the incoming type is a vector that is being promoted, then
// we know that the elements are arranged differently and that we
// must perform the conversion using a stack slot.
if (InVT.isVector())
break;
// If the InOp is promoted to the same size, convert it. Otherwise,
// fall out of the switch and widen the promoted input.
SDValue NInOp = GetPromotedInteger(InOp);
EVT NInVT = NInOp.getValueType();
if (WidenVT.bitsEq(NInVT)) {
// For big endian targets we need to shift the input integer or the
// interesting bits will end up at the wrong place.
if (DAG.getDataLayout().isBigEndian()) {
unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits();
EVT ShiftAmtTy = TLI.getShiftAmountTy(NInVT, DAG.getDataLayout());
assert(ShiftAmt < WidenVT.getSizeInBits() && "Too large shift amount!");
NInOp = DAG.getNode(ISD::SHL, dl, NInVT, NInOp,
DAG.getConstant(ShiftAmt, dl, ShiftAmtTy));
}
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NInOp);
}
InOp = NInOp;
InVT = NInVT;
break;
}
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeSplitVector:
break;
case TargetLowering::TypeWidenVector:
// If the InOp is widened to the same size, convert it. Otherwise, fall
// out of the switch and widen the widened input.
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (WidenVT.bitsEq(InVT))
// The input widens to the same size. Convert to the widened value.
return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
break;
}
unsigned WidenSize = WidenVT.getSizeInBits();
unsigned InSize = InVT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
// Determine new input vector type. The new input vector type will use
// the same element type (if it's a vector) or use the input type as a
// vector. It is the same size as the type to widen to.
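// Illustrative example (assumed types): bitcasting i64 to v2f32 when the
// result widens to v4f32 (128 bits vs. 64 bits) gives NewInVT = v2i64; the
// scalar is placed into a v2i64 with SCALAR_TO_VECTOR and the result is then
// bitcast to the widened v4f32.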
EVT NewInVT;
unsigned NewNumElts = WidenSize / InSize;
if (InVT.isVector()) {
EVT InEltVT = InVT.getVectorElementType();
NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
WidenSize / InEltVT.getSizeInBits());
} else {
NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
}
if (TLI.isTypeLegal(NewInVT)) {
SDValue NewVec;
if (InVT.isVector()) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
Ops[0] = InOp;
NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
} else {
NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
}
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
}
}
return CreateStackStoreLoad(InOp, WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
SDLoc dl(N);
// Build a vector with undef values for the new elements.
EVT VT = N->getValueType(0);
// Integer BUILD_VECTOR operands may be larger than the node's vector element
// type. The UNDEFs need to have the same type as the existing operands.
EVT EltVT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
return DAG.getBuildVector(WidenVT, dl, NewOps);
}
SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
EVT InVT = N->getOperand(0).getValueType();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
- unsigned NumInElts = InVT.getVectorNumElements();
unsigned NumOperands = N->getNumOperands();
bool InputWidened = false; // Indicates we need to widen the input.
if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
- if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned NumInElts = InVT.getVectorMinNumElements();
+ if (WidenNumElts % NumInElts == 0) {
// Add undef vectors to widen to correct length.
- unsigned NumConcat = WidenVT.getVectorNumElements() /
- InVT.getVectorNumElements();
+ unsigned NumConcat = WidenNumElts / NumInElts;
SDValue UndefVal = DAG.getUNDEF(InVT);
SmallVector<SDValue, 16> Ops(NumConcat);
for (unsigned i=0; i < NumOperands; ++i)
Ops[i] = N->getOperand(i);
for (unsigned i = NumOperands; i != NumConcat; ++i)
Ops[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops);
}
} else {
InputWidened = true;
if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
// The inputs and the result are widened to the same type.
unsigned i;
for (i=1; i < NumOperands; ++i)
if (!N->getOperand(i).isUndef())
break;
if (i == NumOperands)
// Everything but the first operand is an UNDEF so just return the
// widened first operand.
return GetWidenedVector(N->getOperand(0));
if (NumOperands == 2) {
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use vector shuffles to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
// Replace concat of two operands with a shuffle.
SmallVector<int, 16> MaskOps(WidenNumElts, -1);
for (unsigned i = 0; i < NumInElts; ++i) {
MaskOps[i] = i;
MaskOps[i + NumInElts] = i + WidenNumElts;
}
return DAG.getVectorShuffle(WidenVT, dl,
GetWidenedVector(N->getOperand(0)),
GetWidenedVector(N->getOperand(1)),
MaskOps);
}
}
}
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use build vectors to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
// Fall back to use extracts and build vector.
EVT EltVT = WidenVT.getVectorElementType();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
if (InputWidened)
InOp = GetWidenedVector(InOp);
for (unsigned j = 0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(j, dl));
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
Ops[Idx] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
// Check if we can just return the input vector after widening.
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && InVT == WidenVT)
return InOp;
// Check if we can extract from the vector.
unsigned InNumElts = InVT.getVectorNumElements();
if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
// We could try widening the input to the right length but for now, extract
// the original elements, fill the rest with undefs and build a vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
for (i = 0; i < NumElts; ++i)
Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(IdxVal + i, dl));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
InOp.getValueType(), InOp,
N->getOperand(1), N->getOperand(2));
}
SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
// A vector must always be stored in memory as-is, i.e. without any padding
// between the elements, since various code depends on it, e.g. in the
// handling of a bitcast of a vector type to int, which may be done with a
// vector store followed by an integer load. A vector that does not have
// elements that are byte-sized must therefore be stored as an integer
// built out of the extracted vector elements.
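// For instance (illustrative), a load whose memory type is v4i1 occupies
// only 4 bits, so it is handled by scalarizeVectorLoad below instead of
// being widened here.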
if (!LD->getMemoryVT().isByteSized()) {
SDValue Value, NewChain;
std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
ReplaceValueWith(SDValue(LD, 0), Value);
ReplaceValueWith(SDValue(LD, 1), NewChain);
return SDValue();
}
SDValue Result;
SmallVector<SDValue, 16> LdChain; // Chain for the series of loads
if (ExtType != ISD::NON_EXTLOAD)
Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
else
Result = GenWidenVectorLoads(LdChain, LD);
// If we generate a single load, we can use that for the chain. Otherwise,
// build a factor node to remember that the multiple loads are independent and
// chain to that.
SDValue NewChain;
if (LdChain.size() == 1)
NewChain = LdChain[0];
else
NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
// Modified the chain - switch anything that used the old chain to use
// the new one.
ReplaceValueWith(SDValue(N, 1), NewChain);
return Result;
}
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue PassThru = GetWidenedVector(N->getPassThru());
ISD::LoadExtType ExtType = N->getExtensionType();
SDLoc dl(N);
// The mask should be widened as well
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
MaskVT.getVectorElementType(),
WidenVT.getVectorNumElements());
Mask = ModifyToType(Mask, WideMaskVT, true);
SDValue Res = DAG.getMaskedLoad(
WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
ExtType, N->isExpandingLoad());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
}
SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue PassThru = GetWidenedVector(N->getPassThru());
SDValue Scale = N->getScale();
unsigned NumElts = WideVT.getVectorNumElements();
SDLoc dl(N);
// The mask should be widened as well
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
MaskVT.getVectorElementType(),
WideVT.getVectorNumElements());
Mask = ModifyToType(Mask, WideMaskVT, true);
// Widen the Index operand
SDValue Index = N->getIndex();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
Index.getValueType().getScalarType(),
NumElts);
Index = ModifyToType(Index, WideIndexVT);
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
Scale };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand(), N->getIndexType());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
}
SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
WidenVT, N->getOperand(0));
}
// Return true if this is a SETCC node or a strict version of it.
static inline bool isSETCCOp(unsigned Opcode) {
switch (Opcode) {
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
return true;
}
return false;
}
// Return true if this is a node that could have two SETCCs as operands.
static inline bool isLogicalMaskOp(unsigned Opcode) {
switch (Opcode) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return true;
}
return false;
}
// If N is a SETCC or a strict variant of it, return the type
// of the compare operands.
static inline EVT getSETCCOperandType(SDValue N) {
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
return N->getOperand(OpNo).getValueType();
}
// This is used just for the assert in convertMask(). Check that this is
// either a SETCC or a SETCC previously handled by convertMask().
#ifndef NDEBUG
static inline bool isSETCCorConvertedSETCC(SDValue N) {
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
for (unsigned i = 1; i < N->getNumOperands(); ++i)
if (!N->getOperand(i)->isUndef())
return false;
N = N.getOperand(0);
}
if (N.getOpcode() == ISD::TRUNCATE)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::SIGN_EXTEND)
N = N.getOperand(0);
if (isLogicalMaskOp(N.getOpcode()))
return isSETCCorConvertedSETCC(N.getOperand(0)) &&
isSETCCorConvertedSETCC(N.getOperand(1));
return (isSETCCOp(N.getOpcode()) ||
ISD::isBuildVectorOfConstantSDNodes(N.getNode()));
}
#endif
// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
// to ToMaskVT if needed with vector extension or truncation.
SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
EVT ToMaskVT) {
// Currently a SETCC or an AND/OR/XOR with two SETCCs is handled.
// FIXME: This code seems to be too restrictive, we might consider
// generalizing it or dropping it.
assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument.");
// Make a new Mask node, with a legal result VT.
SDValue Mask;
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0, e = InMask->getNumOperands(); i < e; ++i)
Ops.push_back(InMask->getOperand(i));
if (InMask->isStrictFPOpcode()) {
Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask),
{ MaskVT, MVT::Other }, Ops);
ReplaceValueWith(InMask.getValue(1), Mask.getValue(1));
}
else
Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops);
// If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
// extend or truncate is needed.
LLVMContext &Ctx = *DAG.getContext();
unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
if (MaskScalarBits < ToMaskScalBits) {
EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
MaskVT.getVectorNumElements());
Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
} else if (MaskScalarBits > ToMaskScalBits) {
EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
MaskVT.getVectorNumElements());
Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
}
assert(Mask->getValueType(0).getScalarSizeInBits() ==
ToMaskVT.getScalarSizeInBits() &&
"Mask should have the right element size by now.");
// Adjust Mask to the right number of elements.
unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(Mask));
Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
ZeroIdx);
} else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
EVT SubVT = Mask->getValueType(0);
SmallVector<SDValue, 16> SubOps(NumSubVecs, DAG.getUNDEF(SubVT));
SubOps[0] = Mask;
Mask = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubOps);
}
assert((Mask->getValueType(0) == ToMaskVT) &&
"A mask of ToMaskVT should have been produced by now.");
return Mask;
}
// This method tries to handle VSELECT and its mask by legalizing operands
// (which may require widening) and if needed adjusting the mask vector type
// to match that of the VSELECT. Without it, many cases end up with
// scalarization of the SETCC, with many unnecessary instructions.
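// Illustrative example (assumed types): a VSELECT of v4f32 values whose
// condition is a v4i1 SETCC of v4f32 operands is rebuilt so the SETCC
// produces an integer mask (e.g. v4i32) matching the VSELECT type, keeping
// the whole select vectorized instead of scalarizing it lane by lane.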
SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
LLVMContext &Ctx = *DAG.getContext();
SDValue Cond = N->getOperand(0);
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
if (!isSETCCOp(Cond->getOpcode()) && !isLogicalMaskOp(Cond->getOpcode()))
return SDValue();
// If this is a split VSELECT that was already handled previously, do
// nothing.
EVT CondVT = Cond->getValueType(0);
if (CondVT.getScalarSizeInBits() != 1)
return SDValue();
EVT VSelVT = N->getValueType(0);
// Only handle vector types which are a power of 2.
if (!isPowerOf2_64(VSelVT.getSizeInBits()))
return SDValue();
// Don't touch if this will be scalarized.
EVT FinalVT = VSelVT;
while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx);
if (FinalVT.getVectorNumElements() == 1)
return SDValue();
// If there is support for an i1 vector mask, don't touch.
if (isSETCCOp(Cond.getOpcode())) {
EVT SetCCOpVT = getSETCCOperandType(Cond);
while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
if (SetCCResVT.getScalarSizeInBits() == 1)
return SDValue();
} else if (CondVT.getScalarType() == MVT::i1) {
// If there is support for an i1 vector mask (or only scalar i1 conditions),
// don't touch.
while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
if (CondVT.getScalarType() == MVT::i1)
return SDValue();
}
// Get the VT and operands for VSELECT, and widen if needed.
SDValue VSelOp1 = N->getOperand(1);
SDValue VSelOp2 = N->getOperand(2);
if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
VSelOp1 = GetWidenedVector(VSelOp1);
VSelOp2 = GetWidenedVector(VSelOp2);
}
// The mask of the VSELECT should have integer elements.
EVT ToMaskVT = VSelVT;
if (!ToMaskVT.getScalarType().isInteger())
ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
SDValue Mask;
if (isSETCCOp(Cond->getOpcode())) {
EVT MaskVT = getSetCCResultType(getSETCCOperandType(Cond));
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else if (isLogicalMaskOp(Cond->getOpcode()) &&
isSETCCOp(Cond->getOperand(0).getOpcode()) &&
isSETCCOp(Cond->getOperand(1).getOpcode())) {
// Cond is (AND/OR/XOR (SETCC, SETCC))
SDValue SETCC0 = Cond->getOperand(0);
SDValue SETCC1 = Cond->getOperand(1);
EVT VT0 = getSetCCResultType(getSETCCOperandType(SETCC0));
EVT VT1 = getSetCCResultType(getSETCCOperandType(SETCC1));
unsigned ScalarBits0 = VT0.getScalarSizeInBits();
unsigned ScalarBits1 = VT1.getScalarSizeInBits();
unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
EVT MaskVT;
// If the two SETCCs have different VTs, either extend/truncate one of
// them to the other "towards" ToMaskVT, or truncate one and extend the
// other to ToMaskVT.
if (ScalarBits0 != ScalarBits1) {
EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
MaskVT = WideVT;
else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
MaskVT = NarrowVT;
else
MaskVT = ToMaskVT;
} else
// If the two SETCCs have the same VT, don't change it.
MaskVT = VT0;
// Make new SETCCs and logical nodes.
SETCC0 = convertMask(SETCC0, VT0, MaskVT);
SETCC1 = convertMask(SETCC1, VT1, MaskVT);
Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
// Convert the logical op for VSELECT if needed.
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else
return SDValue();
return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
if (SDValue Res = WidenVSELECTAndMask(N))
return Res;
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
Cond1 = GetWidenedVector(Cond1);
// If we have to split the condition, there is no point in widening the
// select. This would result in a cycle of widening the select ->
// widening the condition operand -> splitting the condition operand ->
// splitting the select -> widening the select. Instead split this select
// further and widen the resulting type.
if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) {
SDValue SplitSelect = SplitVecOp_VSELECT(N, 0);
SDValue Res = ModifyToType(SplitSelect, WidenVT);
return Res;
}
if (Cond1.getValueType() != CondWidenVT)
Cond1 = ModifyToType(Cond1, CondWidenVT);
}
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, Cond1, InOp1, InOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
SDValue InOp1 = GetWidenedVector(N->getOperand(2));
SDValue InOp2 = GetWidenedVector(N->getOperand(3));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
InOp1.getValueType(), N->getOperand(0),
N->getOperand(1), InOp1, InOp2, N->getOperand(4));
}
SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getUNDEF(WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned NumElts = VT.getVectorNumElements();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
// Adjust mask based on new input vector length.
SmallVector<int, 16> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = N->getMaskElt(i);
if (Idx < (int)NumElts)
NewMask.push_back(Idx);
else
NewMask.push_back(Idx - NumElts + WidenNumElts);
}
for (unsigned i = NumElts; i != WidenNumElts; ++i)
NewMask.push_back(-1);
return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
}
SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operands must be vectors");
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = N->getOperand(0);
EVT InVT = InOp1.getValueType();
assert(InVT.isVector() && "can not widen non-vector type");
EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(), WidenNumElts);
// The input and output types often differ here, and it could be that while
// we'd prefer to widen the result type, the input operands have been split.
// In this case, we need to split the result of this node as well.
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
SDValue SplitVSetCC = SplitVecOp_VSETCC(N);
SDValue Res = ModifyToType(SplitVSetCC, WidenVT);
return Res;
}
// If the inputs also widen, handle them directly. Otherwise widen by hand.
SDValue InOp2 = N->getOperand(1);
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp1 = GetWidenedVector(InOp1);
InOp2 = GetWidenedVector(InOp2);
} else {
InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
}
// Assume that the input and output will be widened appropriately. If not,
// we will have to unroll it at some point.
assert(InOp1.getValueType() == WidenInVT &&
InOp2.getValueType() == WidenInVT &&
"Input not widened to expected type!");
(void)WidenInVT;
return DAG.getNode(ISD::SETCC, SDLoc(N),
WidenVT, InOp1, InOp2, N->getOperand(2));
}
SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(1).getValueType().isVector() &&
"Operands must be vectors");
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
unsigned NumElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDValue CC = N->getOperand(3);
EVT TmpEltVT = LHS.getValueType().getVectorElementType();
// Fully unroll and reassemble.
SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
DAG.getVectorIdxConstant(i, dl));
SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
Chains[i] = Scalars[i].getValue(1);
Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
DAG.getBoolConstant(true, dl, EltVT, VT),
DAG.getBoolConstant(false, dl, EltVT, VT));
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(WidenVT, dl, Scalars);
}
//===----------------------------------------------------------------------===//
// Widen Vector Operand
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom widen this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to widen this operator's operand!");
case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break;
case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break;
case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
Res = WidenVecOp_EXTEND(N);
break;
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND:
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
case ISD::TRUNCATE:
Res = WidenVecOp_Convert(N);
break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
Res = WidenVecOp_VECREDUCE(N);
break;
}
// If Res is null, the sub-method took care of registering the result.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
if (N->isStrictFPOpcode())
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
"Invalid operand expansion");
else
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue InOp = N->getOperand(0);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
assert(VT.getVectorNumElements() <
InOp.getValueType().getVectorNumElements() &&
"Input wasn't widened!");
// We may need to further widen the operand until it has the same total
// vector size as the result.
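// Illustrative example (assumed types): sign-extending v2i8 -> v2i64 where
// the operand was widened to v8i8 (64 bits) but the result is 128 bits: the
// loop below looks for a legal 128-bit type with i8 elements (e.g. v16i8),
// inserts the v8i8 into it, and then uses SIGN_EXTEND_VECTOR_INREG.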
EVT InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits()) {
EVT InEltVT = InVT.getVectorElementType();
for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE; i < e; ++i) {
EVT FixedVT = (MVT::SimpleValueType)i;
EVT FixedEltVT = FixedVT.getVectorElementType();
if (TLI.isTypeLegal(FixedVT) &&
FixedVT.getSizeInBits() == VT.getSizeInBits() &&
FixedEltVT == InEltVT) {
assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
"Not enough elements in the fixed type for the operand!");
assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
"We can't have the same type as we started with!");
if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
DAG.getUNDEF(FixedVT), InOp,
DAG.getVectorIdxConstant(0, DL));
else
InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
DAG.getVectorIdxConstant(0, DL));
break;
}
}
InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits())
// We couldn't find a legal vector type that was a widening of the input
// and could be extended in-register to the result type, so we have to
// scalarize.
return WidenVecOp_Convert(N);
}
// Use special DAG nodes to represent the operation of extending the
// low lanes.
switch (N->getOpcode()) {
default:
llvm_unreachable("Extend legalization on extend operation!");
case ISD::ANY_EXTEND:
return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, InOp);
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, InOp);
case ISD::ZERO_EXTEND:
return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, InOp);
}
}
SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) {
// The result (and first input) is legal, but the second input is illegal.
// We can't do much to fix that, so just unroll and let the extracts off of
// the second input be widened as needed later.
return DAG.UnrollVectorOp(N);
}
SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
// The result is legal, but the input is illegal.
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
unsigned NumElts = VT.getVectorNumElements();
SDValue InOp = N->getOperand(N->isStrictFPOpcode() ? 1 : 0);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
unsigned Opcode = N->getOpcode();
// See if a widened result type would be legal; if so, widen the node.
// FIXME: This isn't safe for StrictFP. Other optimization here is needed.
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
InVT.getVectorNumElements());
if (TLI.isTypeLegal(WideVT) && !N->isStrictFPOpcode()) {
SDValue Res;
if (N->isStrictFPOpcode()) {
if (Opcode == ISD::STRICT_FP_ROUND)
Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
{ N->getOperand(0), InOp, N->getOperand(2) });
else
Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
{ N->getOperand(0), InOp });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
} else {
if (Opcode == ISD::FP_ROUND)
Res = DAG.getNode(Opcode, dl, WideVT, InOp, N->getOperand(1));
else
Res = DAG.getNode(Opcode, dl, WideVT, InOp);
}
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getVectorIdxConstant(0, dl));
}
EVT InEltVT = InVT.getVectorElementType();
// Unroll the convert into some scalar code and create a nasty build vector.
SmallVector<SDValue, 16> Ops(NumElts);
if (N->isStrictFPOpcode()) {
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
SmallVector<SDValue, 32> OpChains;
for (unsigned i=0; i < NumElts; ++i) {
NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getVectorIdxConstant(i, dl));
Ops[i] = DAG.getNode(Opcode, dl, { EltVT, MVT::Other }, NewOps);
OpChains.push_back(Ops[i].getValue(1));
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OpChains);
ReplaceValueWith(SDValue(N, 1), NewChain);
} else {
for (unsigned i = 0; i < NumElts; ++i)
Ops[i] = DAG.getNode(Opcode, dl, EltVT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT,
InOp, DAG.getVectorIdxConstant(i, dl)));
}
return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue InOp = GetWidenedVector(N->getOperand(0));
EVT InWidenVT = InOp.getValueType();
SDLoc dl(N);
// Check if we can convert between two legal vector types and extract.
unsigned InWidenSize = InWidenVT.getSizeInBits();
unsigned Size = VT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) {
unsigned NewNumElts = InWidenSize / Size;
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
DAG.getVectorIdxConstant(0, dl));
}
}
// Handle a case like bitcast v12i8 -> v3i32. Normally that would get widened
// to v16i8 -> v4i32, but for a target where v3i32 is legal but v12i8 is not,
// we end up here. Handling the case here with EXTRACT_SUBVECTOR avoids
// having to copy via memory.
if (VT.isVector()) {
EVT EltVT = VT.getVectorElementType();
unsigned EltSize = EltVT.getSizeInBits();
if (InWidenSize % EltSize == 0) {
unsigned NewNumElts = InWidenSize / EltSize;
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
DAG.getVectorIdxConstant(0, dl));
}
}
}
return CreateStackStoreLoad(InOp, VT);
}
SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
EVT InVT = N->getOperand(0).getValueType();
SDLoc dl(N);
// If the widened width for this operand is the same as the width of the concat
// and all but the first operand are undef, just use the widened operand.
unsigned NumOperands = N->getNumOperands();
if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
unsigned i;
for (i = 1; i < NumOperands; ++i)
if (!N->getOperand(i).isUndef())
break;
if (i == NumOperands)
return GetWidenedVector(N->getOperand(0));
}
// Otherwise, fall back to a nasty build vector.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(NumElts);
unsigned NumInElts = InVT.getVectorNumElements();
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
for (unsigned j = 0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(j, dl));
}
return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
N->getValueType(0), InOp, N->getOperand(1));
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
N->getValueType(0), InOp, N->getOperand(1));
}
SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
// We have to widen the value, but we only want to store the original
// vector type.
StoreSDNode *ST = cast<StoreSDNode>(N);
if (!ST->getMemoryVT().getScalarType().isByteSized())
return TLI.scalarizeVectorStore(ST, DAG);
SmallVector<SDValue, 16> StChain;
if (ST->isTruncatingStore())
GenWidenVectorTruncStores(StChain, ST);
else
GenWidenVectorStores(StChain, ST);
if (StChain.size() == 1)
return StChain[0];
else
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
}
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
assert((OpNo == 1 || OpNo == 3) &&
"Can widen only data or mask operand of mstore");
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
EVT MaskVT = Mask.getValueType();
SDValue StVal = MST->getValue();
SDLoc dl(N);
if (OpNo == 1) {
// Widen the value.
StVal = GetWidenedVector(StVal);
// The mask should be widened as well.
EVT WideVT = StVal.getValueType();
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
MaskVT.getVectorElementType(),
WideVT.getVectorNumElements());
Mask = ModifyToType(Mask, WideMaskVT, true);
} else {
// Widen the mask.
EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), MaskVT);
Mask = ModifyToType(Mask, WideMaskVT, true);
EVT ValueVT = StVal.getValueType();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
ValueVT.getVectorElementType(),
WideMaskVT.getVectorNumElements());
StVal = ModifyToType(StVal, WideVT);
}
assert(Mask.getValueType().getVectorNumElements() ==
StVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(),
MST->getOffset(), Mask, MST->getMemoryVT(),
MST->getMemOperand(), MST->getAddressingMode(),
false, MST->isCompressingStore());
}
SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
assert(OpNo == 4 && "Can widen only the index of mgather");
auto *MG = cast<MaskedGatherSDNode>(N);
SDValue DataOp = MG->getPassThru();
SDValue Mask = MG->getMask();
SDValue Scale = MG->getScale();
// Just widen the index. It's allowed to have extra elements.
SDValue Index = GetWidenedVector(MG->getIndex());
SDLoc dl(N);
SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
Scale};
SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops,
MG->getMemOperand(), MG->getIndexType());
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
return SDValue();
}
SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
SDValue Index = MSC->getIndex();
SDValue Scale = MSC->getScale();
if (OpNo == 1) {
DataOp = GetWidenedVector(DataOp);
unsigned NumElts = DataOp.getValueType().getVectorNumElements();
// Widen index.
EVT IndexVT = Index.getValueType();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
IndexVT.getVectorElementType(), NumElts);
Index = ModifyToType(Index, WideIndexVT);
// The mask should be widened as well.
EVT MaskVT = Mask.getValueType();
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
MaskVT.getVectorElementType(), NumElts);
Mask = ModifyToType(Mask, WideMaskVT, true);
} else if (OpNo == 4) {
// Just widen the index. It's allowed to have extra elements.
Index = GetWidenedVector(Index);
} else
llvm_unreachable("Can't widen this operand of mscatter");
SDValue Ops[] = {MSC->getChain(), DataOp, Mask, MSC->getBasePtr(), Index,
Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
MSC->getMemoryVT(), SDLoc(N), Ops,
MSC->getMemOperand(), MSC->getIndexType());
}
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDLoc dl(N);
EVT VT = N->getValueType(0);
// WARNING: In this code we widen the compare instruction with garbage.
// This garbage may contain denormal floats which may be slow. Is this a real
// concern? Should we zero the unused lanes if this is a float compare?
// Get a new SETCC node to compare the newly widened operands.
// Only some of the compared elements are legal.
EVT SVT = getSetCCResultType(InOp0.getValueType());
// The result type is legal; if it's vXi1, keep vXi1 for the new SETCC.
if (VT.getScalarType() == MVT::i1)
SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
SVT.getVectorNumElements());
SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
SVT, InOp0, InOp1, N->getOperand(2));
// Extract the needed results from the result vector.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
SVT.getVectorElementType(),
VT.getVectorNumElements());
SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
DAG.getVectorIdxConstant(0, dl));
EVT OpVT = N->getOperand(0).getValueType();
ISD::NodeType ExtendCode =
TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
return DAG.getNode(ExtendCode, dl, VT, CC);
}
SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue LHS = GetWidenedVector(N->getOperand(1));
SDValue RHS = GetWidenedVector(N->getOperand(2));
SDValue CC = N->getOperand(3);
SDLoc dl(N);
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
EVT TmpEltVT = LHS.getValueType().getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
// Unroll into a build vector.
SmallVector<SDValue, 8> Scalars(NumElts);
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
DAG.getVectorIdxConstant(i, dl));
SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
Chains[i] = Scalars[i].getValue(1);
Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
DAG.getBoolConstant(true, dl, EltVT, VT),
DAG.getBoolConstant(false, dl, EltVT, VT));
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(VT, dl, Scalars);
}
SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
SDLoc dl(N);
SDValue Op = GetWidenedVector(N->getOperand(0));
EVT OrigVT = N->getOperand(0).getValueType();
EVT WideVT = Op.getValueType();
EVT ElemVT = OrigVT.getVectorElementType();
SDValue NeutralElem;
switch (N->getOpcode()) {
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_UMAX:
NeutralElem = DAG.getConstant(0, dl, ElemVT);
break;
case ISD::VECREDUCE_MUL:
NeutralElem = DAG.getConstant(1, dl, ElemVT);
break;
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_UMIN:
NeutralElem = DAG.getAllOnesConstant(dl, ElemVT);
break;
case ISD::VECREDUCE_SMAX:
NeutralElem = DAG.getConstant(
APInt::getSignedMinValue(ElemVT.getSizeInBits()), dl, ElemVT);
break;
case ISD::VECREDUCE_SMIN:
NeutralElem = DAG.getConstant(
APInt::getSignedMaxValue(ElemVT.getSizeInBits()), dl, ElemVT);
break;
case ISD::VECREDUCE_FADD:
NeutralElem = DAG.getConstantFP(0.0, dl, ElemVT);
break;
case ISD::VECREDUCE_FMUL:
NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT);
break;
case ISD::VECREDUCE_FMAX:
NeutralElem = DAG.getConstantFP(
-std::numeric_limits<double>::infinity(), dl, ElemVT);
break;
case ISD::VECREDUCE_FMIN:
NeutralElem = DAG.getConstantFP(
std::numeric_limits<double>::infinity(), dl, ElemVT);
break;
}
// Pad the vector with the neutral element.
unsigned OrigElts = OrigVT.getVectorNumElements();
unsigned WideElts = WideVT.getVectorNumElements();
for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
DAG.getVectorIdxConstant(Idx, dl));
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags());
}
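// Illustrative sketch (not part of this patch): padding a reduction's input
// with the operation's neutral element leaves the result unchanged, which is
// why the extra lanes introduced by widening are filled as above. The helper
// name and values below are made up for the example.
#include <algorithm>
#include <cstdint>
#include <vector>

static uint32_t reduceUMax(const std::vector<uint32_t> &V) {
  uint32_t R = 0;                         // 0 is the neutral element for unsigned max
  for (uint32_t X : V)
    R = std::max(R, X);
  return R;
}

int main() {
  std::vector<uint32_t> Orig = {7, 3, 9};    // a "3 x i32" input
  std::vector<uint32_t> Widened = Orig;
  Widened.push_back(0);                      // pad to "4 x i32" with the neutral element
  return reduceUMax(Orig) == reduceUMax(Widened) ? 0 : 1; // both reductions are 9
}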
SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
// This only gets called in the case that the left and right inputs and
// result are of a legal odd vector type, and the condition is illegal i1 of
// the same odd width that needs widening.
EVT VT = N->getValueType(0);
assert(VT.isVector() && !VT.isPow2VectorType() && isTypeLegal(VT));
SDValue Cond = GetWidenedVector(N->getOperand(0));
SDValue LeftIn = DAG.WidenVector(N->getOperand(1), SDLoc(N));
SDValue RightIn = DAG.WidenVector(N->getOperand(2), SDLoc(N));
SDLoc DL(N);
SDValue Select = DAG.getNode(N->getOpcode(), DL, LeftIn.getValueType(), Cond,
LeftIn, RightIn);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Select,
DAG.getVectorIdxConstant(0, DL));
}
//===----------------------------------------------------------------------===//
// Vector Widening Utilities
//===----------------------------------------------------------------------===//
// Utility function to find the type to chop up a widened vector for load/store.
// TLI: Target lowering used to determine legal types.
// Width: Width left to load/store, in bits.
// WidenVT: The widened vector type to load to/store from.
// Align: If 0, don't allow use of a wider type.
// WidenEx: If Align is not 0, the additional number of bits we can load/store.
static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
unsigned Width, EVT WidenVT,
unsigned Align = 0, unsigned WidenEx = 0) {
EVT WidenEltVT = WidenVT.getVectorElementType();
const bool Scalable = WidenVT.isScalableVector();
unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
unsigned AlignInBits = Align*8;
// If we have one element to load/store, return it.
EVT RetVT = WidenEltVT;
if (Width == WidenEltWidth)
return RetVT;
// See if there is a larger legal integer type than the element type to load/store.
unsigned VT;
// Don't bother looking for an integer type if the vector is scalable; skip
// straight to vector types.
if (!Scalable) {
for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
EVT MemVT((MVT::SimpleValueType) VT);
unsigned MemVTWidth = MemVT.getSizeInBits();
if (MemVT.getSizeInBits() <= WidenEltWidth)
break;
auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
if ((Action == TargetLowering::TypeLegal ||
Action == TargetLowering::TypePromoteInteger) &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
if (MemVTWidth == WidenWidth)
return MemVT;
RetVT = MemVT;
break;
}
}
}
// See if there is a larger vector type to load/store that has the same vector
// element type and whose width evenly divides the width of WidenVT.
for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
EVT MemVT = (MVT::SimpleValueType) VT;
// Skip vector MVTs which don't match the scalable property of WidenVT.
if (Scalable != MemVT.isScalableVector())
continue;
unsigned MemVTWidth = MemVT.getSizeInBits().getKnownMinSize();
auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
if ((Action == TargetLowering::TypeLegal ||
Action == TargetLowering::TypePromoteInteger) &&
WidenEltVT == MemVT.getVectorElementType() &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
return MemVT;
}
}
return RetVT;
}
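// Illustrative sketch (not part of this patch): a simplified model of the
// chopping strategy above. Given the number of bits still to transfer and a
// hypothetical set of legal power-of-two widths, repeatedly pick the largest
// legal width that fits. FindMemType adds further constraints (vector element
// types, scalable vectors, alignment slack), which are omitted here.
#include <cstdio>
#include <vector>

static int pickChunk(int BitsLeft, const std::vector<int> &LegalWidths) {
  int Best = LegalWidths.front();           // smallest legal width as a fallback
  for (int W : LegalWidths)
    if (W <= BitsLeft && W > Best)
      Best = W;
  return Best;
}

int main() {
  // Widening a 96-bit load (e.g. 3 x i32) on a target whose legal widths are
  // 32, 64 and 128 bits: the loop below prints 64, then 32.
  std::vector<int> Legal = {32, 64, 128};
  int BitsLeft = 96;
  while (BitsLeft > 0) {
    int W = pickChunk(BitsLeft, Legal);
    std::printf("load %d bits\n", W);
    BitsLeft -= W;
  }
  return 0;
}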
// Builds a vector from scalar loads.
// VecTy: Resulting vector type.
// LdOps: Scalar loads used to build the vector.
// [Start,End): The range of loads in LdOps to use.
static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
SmallVectorImpl<SDValue> &LdOps,
unsigned Start, unsigned End) {
SDLoc dl(LdOps[Start]);
EVT LdTy = LdOps[Start].getValueType();
unsigned Width = VecTy.getSizeInBits();
unsigned NumElts = Width / LdTy.getSizeInBits();
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
unsigned Idx = 1;
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
for (unsigned i = Start + 1; i != End; ++i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
NumElts = Width / NewLdTy.getSizeInBits();
NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
// Readjust the insertion position based on the new load type.
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
}
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
DAG.getVectorIdxConstant(Idx++, dl));
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
}
SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD) {
// The strategy assumes that we can efficiently load power-of-two widths.
// The routine chops the vector into the largest vector loads with the same
// element type or scalar loads and then recombines them into the widened
// vector type.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
unsigned WidenWidth = WidenVT.getSizeInBits();
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth;
- // Allow wider loads.
+ // Allow wider loads if they are sufficiently aligned to avoid memory faults
+ // and if the original load is simple.
unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
// Find the vector type that we can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(LdOp.getValue(1));
// Check if we can load the element with one instruction.
if (LdWidth <= NewVTWidth) {
if (!NewVT.isVector()) {
unsigned NumElts = WidenWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
}
if (NewVT == WidenVT)
return LdOp;
assert(WidenWidth % NewVTWidth == 0);
unsigned NumConcat = WidenWidth / NewVTWidth;
SmallVector<SDValue, 16> ConcatOps(NumConcat);
SDValue UndefVal = DAG.getUNDEF(NewVT);
ConcatOps[0] = LdOp;
for (unsigned i = 1; i != NumConcat; ++i)
ConcatOps[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
}
// Load vector by using multiple loads from largest vector to scalar.
SmallVector<SDValue, 16> LdOps;
LdOps.push_back(LdOp);
LdWidth -= NewVTWidth;
unsigned Offset = 0;
while (LdWidth > 0) {
unsigned Increment = NewVTWidth / 8;
Offset += Increment;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
SDValue L;
if (LdWidth < NewVTWidth) {
// The current type we are using is too large. Find a better size.
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
- if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
- // Later code assumes the vector loads produced will be mergeable, so we
- // must pad the final entry up to the previous width. Scalars are
- // combined separately.
- SmallVector<SDValue, 16> Loads;
- Loads.push_back(L);
- unsigned size = L->getValueSizeInBits(0);
- while (size < LdOp->getValueSizeInBits(0)) {
- Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
- size += L->getValueSizeInBits(0);
- }
- L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
- }
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
}
LdOps.push_back(L);
LdOp = L;
LdWidth -= NewVTWidth;
}
// Build the vector from the load operations.
unsigned End = LdOps.size();
if (!LdOps[0].getValueType().isVector())
// All the loads are scalar loads.
return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
// If the load contains vectors, build the vector using concat vector.
// All of the vectors used to load are power-of-2, and the scalar loads can be
// combined to make a power-of-2 vector.
SmallVector<SDValue, 16> ConcatOps(End);
int i = End - 1;
int Idx = End;
EVT LdTy = LdOps[i].getValueType();
// First, combine the scalar loads to a vector.
if (!LdTy.isVector()) {
for (--i; i >= 0; --i) {
LdTy = LdOps[i].getValueType();
if (LdTy.isVector())
break;
}
ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
}
ConcatOps[--Idx] = LdOps[i];
for (--i; i >= 0; --i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
// Create a larger vector.
+ unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
+ assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+ SmallVector<SDValue, 16> WidenOps(NumOps);
+ unsigned j = 0;
+ for (; j != End-Idx; ++j)
+ WidenOps[j] = ConcatOps[Idx+j];
+ for (; j != NumOps; ++j)
+ WidenOps[j] = DAG.getUNDEF(LdTy);
+
ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
- makeArrayRef(&ConcatOps[Idx], End - Idx));
+ WidenOps);
Idx = End - 1;
LdTy = NewLdTy;
}
ConcatOps[--Idx] = LdOps[i];
}
if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(&ConcatOps[Idx], End - Idx));
// We need to fill the rest with undefs to build the vector.
unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
SmallVector<SDValue, 16> WidenOps(NumOps);
SDValue UndefVal = DAG.getUNDEF(LdTy);
{
unsigned i = 0;
for (; i != End-Idx; ++i)
WidenOps[i] = ConcatOps[Idx+i];
for (; i != NumOps; ++i)
WidenOps[i] = UndefVal;
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps);
}
SDValue
DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD,
ISD::LoadExtType ExtType) {
// For extension loads, it may not be more efficient to chop up the vector
// and then extend it. Instead, we unroll the load and build a new vector.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
unsigned NumElts = LdVT.getVectorNumElements();
// Load each element and widen.
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Increment = LdEltVT.getSizeInBits() / 8;
Ops[0] =
DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
LdEltVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(Ops[0].getValue(1));
unsigned i = 0, Offset = Increment;
for (i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(Ops[i].getValue(1));
}
// Fill the rest with undefs.
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
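// Illustrative sketch (not part of this patch): the per-element form of the
// extending-load widening above, modeled on plain memory. Each narrow element
// is loaded at its own byte offset and extended; lanes past the original
// element count are left undefined (shown as 0 here). The helper name is
// made up for the example.
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<int32_t> sextLoadAndWiden(const int8_t *Mem, unsigned NumElts,
                                             unsigned WidenNumElts) {
  std::vector<int32_t> Out(WidenNumElts, 0);  // lanes past NumElts are "undef" (0 here)
  for (unsigned i = 0; i != NumElts; ++i) {
    int8_t Narrow;
    std::memcpy(&Narrow, Mem + i, sizeof(Narrow)); // load element i at its offset
    Out[i] = static_cast<int32_t>(Narrow);         // sign-extend i8 -> i32
  }
  return Out;
}

int main() {
  const int8_t Mem[3] = {-1, 2, -3};                     // a "3 x i8" in memory
  std::vector<int32_t> V = sextLoadAndWiden(Mem, 3, 4);  // widened to 4 lanes
  return (V[0] == -1 && V[2] == -3) ? 0 : 1;
}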
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
// The routine chops the vector into the largest vector stores with the same
// element type or scalar stores.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
unsigned StWidth = StVT.getSizeInBits();
EVT ValVT = ValOp.getValueType();
unsigned ValWidth = ValVT.getSizeInBits();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned ValEltWidth = ValEltVT.getSizeInBits();
assert(StVT.getVectorElementType() == ValEltVT);
int Idx = 0; // current index to store
unsigned Offset = 0; // offset from base to store
while (StWidth != 0) {
// Find the largest vector type we can store with.
EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
unsigned NewVTWidth = NewVT.getSizeInBits();
unsigned Increment = NewVTWidth / 8;
if (NewVT.isVector()) {
unsigned NumVTElts = NewVT.getVectorNumElements();
do {
SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
DAG.getVectorIdxConstant(Idx, dl));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
} while (StWidth != 0 && StWidth >= NewVTWidth);
} else {
// Cast the vector to the scalar type we can store.
unsigned NumElts = ValWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
// Readjust index position based on new vector type.
Idx = Idx * ValEltWidth / NewVTWidth;
do {
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
DAG.getVectorIdxConstant(Idx++, dl));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
} while (StWidth != 0 && StWidth >= NewVTWidth);
// Restore index back to be relative to the original widen element type.
Idx = Idx * NewVTWidth / ValEltWidth;
}
}
}
void
DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// For truncating stores, it may not be more efficient to truncate the vector
// and then store it. Instead, we extract each element and then store it.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
EVT ValVT = ValOp.getValueType();
// It must be true that the wide vector type is bigger than where we need to
// store.
assert(StVT.isVector() && ValOp.getValueType().isVector());
assert(StVT.bitsLT(ValOp.getValueType()));
// For truncating stores, we can not play the tricks of chopping legal vector
// types and bitcast it to the right type. Instead, we unroll the store.
EVT StEltVT = StVT.getVectorElementType();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
unsigned NumElts = StVT.getVectorNumElements();
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getVectorIdxConstant(0, dl));
StChain.push_back(
DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT,
ST->getOriginalAlign(), MMOFlags, AAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getVectorIdxConstant(i, dl));
StChain.push_back(DAG.getTruncStore(
Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
StEltVT, ST->getOriginalAlign(), MMOFlags, AAInfo));
}
}
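// Illustrative sketch (not part of this patch): the unrolled truncating store
// above, modeled on plain memory. Each of the first NumElts lanes of the
// widened value is truncated and stored at its own byte offset; the extra
// widened lanes are simply never stored. The helper name and values are made
// up for the example.
#include <cstdint>
#include <cstring>
#include <vector>

static void truncStoreUnrolled(uint8_t *Mem, const std::vector<uint32_t> &Val,
                               unsigned NumElts) {
  for (unsigned i = 0; i != NumElts; ++i) {
    uint8_t Narrow = static_cast<uint8_t>(Val[i]);           // truncate i32 -> i8
    std::memcpy(Mem + i, &Narrow, sizeof(Narrow));            // store element i
  }
}

int main() {
  std::vector<uint32_t> Widened = {0x101, 0x202, 0x303, 0xDEAD}; // 4 lanes, store 3
  uint8_t Mem[3] = {};
  truncStoreUnrolled(Mem, Widened, 3);
  return (Mem[0] == 0x01 && Mem[1] == 0x02 && Mem[2] == 0x03) ? 0 : 1;
}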
/// Modifies a vector input (widens or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
/// FillWithZeroes specifies that the vector should be widened with zeroes.
SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
bool FillWithZeroes) {
// Note that InOp might have been widened so it might already have
// the right width or it might need to be narrowed.
EVT InVT = InOp.getValueType();
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
SDLoc dl(InOp);
// Check if InOp already has the right width.
if (InVT == NVT)
return InOp;
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
unsigned NumConcat = WidenNumElts / InNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
DAG.getUNDEF(InVT);
Ops[0] = InOp;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = FillVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
}
if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
DAG.getVectorIdxConstant(0, dl));
// Fall back to extract and build.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = NVT.getVectorElementType();
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
unsigned Idx;
for (Idx = 0; Idx < MinNumElts; ++Idx)
Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(Idx, dl));
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
Ops[Idx] = FillVal;
return DAG.getBuildVector(NVT, dl, Ops);
}
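// Illustrative sketch (not part of this patch): the element-count adjustment
// performed by ModifyToType, modeled on std::vector. Narrowing keeps the
// leading elements; widening appends fill values; the element type itself
// never changes. The helper name is made up for the example.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> modifyToCount(std::vector<uint32_t> In,
                                           unsigned NewNumElts) {
  // Narrowing keeps the leading NewNumElts elements (EXTRACT_SUBVECTOR at
  // index 0); widening appends fill values (zero or undef, both shown as 0).
  In.resize(NewNumElts, 0u);
  return In;
}

int main() {
  std::vector<uint32_t> V = {1, 2, 3};
  bool OK = modifyToCount(V, 4).size() == 4 &&
            modifyToCount(V, 2) == std::vector<uint32_t>({1, 2});
  return OK ? 0 : 1;
}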
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 4796ef531054..8e7bf1eb0169 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -1,1174 +1,1175 @@
//===- llvm/MC/WinCOFFObjectWriter.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an implementation of a Win32 COFF object file writer.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/CRC.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <memory>
#include <string>
#include <vector>
using namespace llvm;
using llvm::support::endian::write32le;
#define DEBUG_TYPE "WinCOFFObjectWriter"
namespace {
using name = SmallString<COFF::NameSize>;
enum AuxiliaryType {
ATWeakExternal,
ATFile,
ATSectionDefinition
};
struct AuxSymbol {
AuxiliaryType AuxType;
COFF::Auxiliary Aux;
};
class COFFSection;
class COFFSymbol {
public:
COFF::symbol Data = {};
using AuxiliarySymbols = SmallVector<AuxSymbol, 1>;
name Name;
int Index;
AuxiliarySymbols Aux;
COFFSymbol *Other = nullptr;
COFFSection *Section = nullptr;
int Relocations = 0;
const MCSymbol *MC = nullptr;
COFFSymbol(StringRef Name) : Name(Name) {}
void set_name_offset(uint32_t Offset);
int64_t getIndex() const { return Index; }
void setIndex(int Value) {
Index = Value;
if (MC)
MC->setIndex(static_cast<uint32_t>(Value));
}
};
// This class contains staging data for a COFF relocation entry.
struct COFFRelocation {
COFF::relocation Data;
COFFSymbol *Symb = nullptr;
COFFRelocation() = default;
static size_t size() { return COFF::RelocationSize; }
};
using relocations = std::vector<COFFRelocation>;
class COFFSection {
public:
COFF::section Header = {};
std::string Name;
int Number;
MCSectionCOFF const *MCSection = nullptr;
COFFSymbol *Symbol = nullptr;
relocations Relocations;
COFFSection(StringRef Name) : Name(std::string(Name)) {}
};
class WinCOFFObjectWriter : public MCObjectWriter {
public:
support::endian::Writer W;
using symbols = std::vector<std::unique_ptr<COFFSymbol>>;
using sections = std::vector<std::unique_ptr<COFFSection>>;
using symbol_map = DenseMap<MCSymbol const *, COFFSymbol *>;
using section_map = DenseMap<MCSection const *, COFFSection *>;
using symbol_list = DenseSet<COFFSymbol *>;
std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
// Root level file contents.
COFF::header Header = {};
sections Sections;
symbols Symbols;
StringTableBuilder Strings{StringTableBuilder::WinCOFF};
// Maps used during object file creation.
section_map SectionMap;
symbol_map SymbolMap;
symbol_list WeakDefaults;
bool UseBigObj;
bool EmitAddrsigSection = false;
MCSectionCOFF *AddrsigSection;
std::vector<const MCSymbol *> AddrsigSyms;
MCSectionCOFF *CGProfileSection = nullptr;
WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS);
void reset() override {
memset(&Header, 0, sizeof(Header));
Header.Machine = TargetObjectWriter->getMachine();
Sections.clear();
Symbols.clear();
Strings.clear();
SectionMap.clear();
SymbolMap.clear();
MCObjectWriter::reset();
}
COFFSymbol *createSymbol(StringRef Name);
COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol);
COFFSection *createSection(StringRef Name);
void defineSection(MCSectionCOFF const &Sec);
COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
void DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler,
const MCAsmLayout &Layout);
void SetSymbolName(COFFSymbol &S);
void SetSectionName(COFFSection &S);
bool IsPhysicalSection(COFFSection *S);
// Entity writing methods.
void WriteFileHeader(const COFF::header &Header);
void WriteSymbol(const COFFSymbol &S);
void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
void writeSectionHeaders();
void WriteRelocation(const COFF::relocation &R);
uint32_t writeSectionContents(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCSection &MCSec);
void writeSection(MCAssembler &Asm, const MCAsmLayout &Layout,
const COFFSection &Sec, const MCSection &MCSec);
// MCObjectWriter interface implementation.
void executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) override;
bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, uint64_t &FixedValue) override;
void createFileSymbols(MCAssembler &Asm);
void setWeakDefaultNames();
void assignSectionNumbers();
void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
void emitAddrsigSection() override { EmitAddrsigSection = true; }
void addAddrsigSymbol(const MCSymbol *Sym) override {
AddrsigSyms.push_back(Sym);
}
uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
} // end anonymous namespace
//------------------------------------------------------------------------------
// Symbol class implementation
// In the case that the name does not fit within 8 bytes, the offset
// into the string table is stored in the last 4 bytes instead, leaving
// the first 4 bytes as 0.
void COFFSymbol::set_name_offset(uint32_t Offset) {
write32le(Data.Name + 0, 0);
write32le(Data.Name + 4, Offset);
}
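// Illustrative sketch (not part of the original file): the 8-byte symbol-name
// field layout used above for long names. The first four bytes are zero and
// the last four hold the little-endian string table offset. The helper name
// is made up for the example.
#include <cassert>
#include <cstdint>
#include <cstring>

static void setLongNameOffset(char Name[8], uint32_t Offset) {
  std::memset(Name, 0, 4);                 // first dword: 0 marks "long name"
  for (int i = 0; i < 4; ++i)              // second dword: offset, little-endian
    Name[4 + i] = static_cast<char>((Offset >> (8 * i)) & 0xFF);
}

int main() {
  char Name[8];
  setLongNameOffset(Name, 0x12345678);
  assert(static_cast<unsigned char>(Name[4]) == 0x78); // least significant byte first
  return 0;
}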
//------------------------------------------------------------------------------
// WinCOFFObjectWriter class implementation
WinCOFFObjectWriter::WinCOFFObjectWriter(
std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
: W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {
Header.Machine = TargetObjectWriter->getMachine();
}
COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
Symbols.push_back(std::make_unique<COFFSymbol>(Name));
return Symbols.back().get();
}
COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
COFFSymbol *&Ret = SymbolMap[Symbol];
if (!Ret)
Ret = createSymbol(Symbol->getName());
return Ret;
}
COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
Sections.emplace_back(std::make_unique<COFFSection>(Name));
return Sections.back().get();
}
static uint32_t getAlignment(const MCSectionCOFF &Sec) {
switch (Sec.getAlignment()) {
case 1:
return COFF::IMAGE_SCN_ALIGN_1BYTES;
case 2:
return COFF::IMAGE_SCN_ALIGN_2BYTES;
case 4:
return COFF::IMAGE_SCN_ALIGN_4BYTES;
case 8:
return COFF::IMAGE_SCN_ALIGN_8BYTES;
case 16:
return COFF::IMAGE_SCN_ALIGN_16BYTES;
case 32:
return COFF::IMAGE_SCN_ALIGN_32BYTES;
case 64:
return COFF::IMAGE_SCN_ALIGN_64BYTES;
case 128:
return COFF::IMAGE_SCN_ALIGN_128BYTES;
case 256:
return COFF::IMAGE_SCN_ALIGN_256BYTES;
case 512:
return COFF::IMAGE_SCN_ALIGN_512BYTES;
case 1024:
return COFF::IMAGE_SCN_ALIGN_1024BYTES;
case 2048:
return COFF::IMAGE_SCN_ALIGN_2048BYTES;
case 4096:
return COFF::IMAGE_SCN_ALIGN_4096BYTES;
case 8192:
return COFF::IMAGE_SCN_ALIGN_8192BYTES;
}
llvm_unreachable("unsupported section alignment");
}
/// This function takes a section data object from the assembler
/// and creates the associated COFF section staging object.
void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
COFFSection *Section = createSection(MCSec.getName());
COFFSymbol *Symbol = createSymbol(MCSec.getName());
Section->Symbol = Symbol;
Symbol->Section = Section;
Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
// Create a COMDAT symbol if needed.
if (MCSec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
if (const MCSymbol *S = MCSec.getCOMDATSymbol()) {
COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
if (COMDATSymbol->Section)
report_fatal_error("two sections have the same comdat");
COMDATSymbol->Section = Section;
}
}
// In this case the auxiliary symbol is a Section Definition.
Symbol->Aux.resize(1);
Symbol->Aux[0] = {};
Symbol->Aux[0].AuxType = ATSectionDefinition;
Symbol->Aux[0].Aux.SectionDefinition.Selection = MCSec.getSelection();
// Set section alignment.
Section->Header.Characteristics = MCSec.getCharacteristics();
Section->Header.Characteristics |= getAlignment(MCSec);
// Bind internal COFF section to MC section.
Section->MCSection = &MCSec;
SectionMap[&MCSec] = Section;
}
static uint64_t getSymbolValue(const MCSymbol &Symbol,
const MCAsmLayout &Layout) {
if (Symbol.isCommon() && Symbol.isExternal())
return Symbol.getCommonSize();
uint64_t Res;
if (!Layout.getSymbolOffset(Symbol, Res))
return 0;
return Res;
}
COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
if (!Symbol.isVariable())
return nullptr;
const MCSymbolRefExpr *SymRef =
dyn_cast<MCSymbolRefExpr>(Symbol.getVariableValue());
if (!SymRef)
return nullptr;
const MCSymbol &Aliasee = SymRef->getSymbol();
if (!Aliasee.isUndefined())
return nullptr;
return GetOrCreateCOFFSymbol(&Aliasee);
}
/// This function takes a symbol data object from the assembler
/// and creates the associated COFF symbol staging object.
void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
MCAssembler &Assembler,
const MCAsmLayout &Layout) {
COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym);
const MCSymbol *Base = Layout.getBaseSymbol(MCSym);
COFFSection *Sec = nullptr;
if (Base && Base->getFragment()) {
Sec = SectionMap[Base->getFragment()->getParent()];
if (Sym->Section && Sym->Section != Sec)
report_fatal_error("conflicting sections for symbol");
}
COFFSymbol *Local = nullptr;
if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {
Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+ Sym->Section = nullptr;
COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);
if (!WeakDefault) {
std::string WeakName = (".weak." + MCSym.getName() + ".default").str();
WeakDefault = createSymbol(WeakName);
if (!Sec)
WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
else
WeakDefault->Section = Sec;
WeakDefaults.insert(WeakDefault);
Local = WeakDefault;
}
Sym->Other = WeakDefault;
// Setup the Weak External auxiliary symbol.
Sym->Aux.resize(1);
memset(&Sym->Aux[0], 0, sizeof(Sym->Aux[0]));
Sym->Aux[0].AuxType = ATWeakExternal;
Sym->Aux[0].Aux.WeakExternal.TagIndex = 0;
Sym->Aux[0].Aux.WeakExternal.Characteristics =
COFF::IMAGE_WEAK_EXTERN_SEARCH_ALIAS;
} else {
if (!Base)
Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
else
Sym->Section = Sec;
Local = Sym;
}
if (Local) {
Local->Data.Value = getSymbolValue(MCSym, Layout);
const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym);
Local->Data.Type = SymbolCOFF.getType();
Local->Data.StorageClass = SymbolCOFF.getClass();
// If no storage class was specified in the streamer, define it here.
if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
bool IsExternal = MCSym.isExternal() ||
(!MCSym.getFragment() && !MCSym.isVariable());
Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
: COFF::IMAGE_SYM_CLASS_STATIC;
}
}
Sym->MC = &MCSym;
}
// Maximum offsets for different string table entry encodings.
enum : unsigned { Max7DecimalOffset = 9999999U };
enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6 - 1, since offsets start at 0
// Encode a string table entry offset in base 64, padded to 6 chars, and
// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ...
// Buffer must be at least 8 bytes large. No terminating null appended.
static void encodeBase64StringEntry(char *Buffer, uint64_t Value) {
assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset &&
"Illegal section name encoding for value");
static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
Buffer[0] = '/';
Buffer[1] = '/';
char *Ptr = Buffer + 7;
for (unsigned i = 0; i < 6; ++i) {
unsigned Rem = Value % 64;
Value /= 64;
*(Ptr--) = Alphabet[Rem];
}
}
void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
if (S.Name.size() <= COFF::NameSize) {
std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
return;
}
uint64_t StringTableEntry = Strings.getOffset(S.Name);
if (StringTableEntry <= Max7DecimalOffset) {
SmallVector<char, COFF::NameSize> Buffer;
Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
return;
}
if (StringTableEntry <= MaxBase64Offset) {
// Starting with 10,000,000, offsets are encoded as base64.
encodeBase64StringEntry(S.Header.Name, StringTableEntry);
return;
}
report_fatal_error("COFF string table is greater than 64 GB.");
}
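// Illustrative sketch (not part of the original file): the three encodings
// chosen by SetSectionName. Short names are stored inline; longer names
// become a string table reference, written either as "/<decimal offset>" or,
// for offsets past 9,999,999, as "//" plus six base64 digits (most
// significant digit first). The helper name is made up for the example.
#include <cstdint>
#include <cstdio>
#include <string>

static std::string encodeSectionNameField(const std::string &Name,
                                          uint64_t StringTableOffset) {
  if (Name.size() <= 8)
    return Name;                                    // fits in the 8-byte field
  if (StringTableOffset <= 9999999u)
    return "/" + std::to_string(StringTableOffset); // e.g. "/1234567"
  static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                 "abcdefghijklmnopqrstuvwxyz"
                                 "0123456789+/";
  std::string Out = "//AAAAAA";
  for (int i = 7; i >= 2; --i) {                    // fill digits right to left
    Out[i] = Alphabet[StringTableOffset % 64];
    StringTableOffset /= 64;
  }
  return Out;
}

int main() {
  // 10,000,000 is the first offset that needs base64; it encodes to "//AAmJaA".
  std::printf("%s\n",
              encodeSectionNameField(".debug_with_a_long_name", 10000000).c_str());
}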
void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
if (S.Name.size() > COFF::NameSize)
S.set_name_offset(Strings.getOffset(S.Name));
else
std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
}
bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) {
return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) ==
0;
}
//------------------------------------------------------------------------------
// entity writing methods
void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
if (UseBigObj) {
W.write<uint16_t>(COFF::IMAGE_FILE_MACHINE_UNKNOWN);
W.write<uint16_t>(0xFFFF);
W.write<uint16_t>(COFF::BigObjHeader::MinBigObjectVersion);
W.write<uint16_t>(Header.Machine);
W.write<uint32_t>(Header.TimeDateStamp);
W.OS.write(COFF::BigObjMagic, sizeof(COFF::BigObjMagic));
W.write<uint32_t>(0);
W.write<uint32_t>(0);
W.write<uint32_t>(0);
W.write<uint32_t>(0);
W.write<uint32_t>(Header.NumberOfSections);
W.write<uint32_t>(Header.PointerToSymbolTable);
W.write<uint32_t>(Header.NumberOfSymbols);
} else {
W.write<uint16_t>(Header.Machine);
W.write<uint16_t>(static_cast<int16_t>(Header.NumberOfSections));
W.write<uint32_t>(Header.TimeDateStamp);
W.write<uint32_t>(Header.PointerToSymbolTable);
W.write<uint32_t>(Header.NumberOfSymbols);
W.write<uint16_t>(Header.SizeOfOptionalHeader);
W.write<uint16_t>(Header.Characteristics);
}
}
void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
W.OS.write(S.Data.Name, COFF::NameSize);
W.write<uint32_t>(S.Data.Value);
if (UseBigObj)
W.write<uint32_t>(S.Data.SectionNumber);
else
W.write<uint16_t>(static_cast<int16_t>(S.Data.SectionNumber));
W.write<uint16_t>(S.Data.Type);
W.OS << char(S.Data.StorageClass);
W.OS << char(S.Data.NumberOfAuxSymbols);
WriteAuxiliarySymbols(S.Aux);
}
void WinCOFFObjectWriter::WriteAuxiliarySymbols(
const COFFSymbol::AuxiliarySymbols &S) {
for (const AuxSymbol &i : S) {
switch (i.AuxType) {
case ATWeakExternal:
W.write<uint32_t>(i.Aux.WeakExternal.TagIndex);
W.write<uint32_t>(i.Aux.WeakExternal.Characteristics);
W.OS.write_zeros(sizeof(i.Aux.WeakExternal.unused));
if (UseBigObj)
W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATFile:
W.OS.write(reinterpret_cast<const char *>(&i.Aux),
UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size);
break;
case ATSectionDefinition:
W.write<uint32_t>(i.Aux.SectionDefinition.Length);
W.write<uint16_t>(i.Aux.SectionDefinition.NumberOfRelocations);
W.write<uint16_t>(i.Aux.SectionDefinition.NumberOfLinenumbers);
W.write<uint32_t>(i.Aux.SectionDefinition.CheckSum);
W.write<uint16_t>(static_cast<int16_t>(i.Aux.SectionDefinition.Number));
W.OS << char(i.Aux.SectionDefinition.Selection);
W.OS.write_zeros(sizeof(i.Aux.SectionDefinition.unused));
W.write<uint16_t>(static_cast<int16_t>(i.Aux.SectionDefinition.Number >> 16));
if (UseBigObj)
W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
}
}
}
// Write the section header.
void WinCOFFObjectWriter::writeSectionHeaders() {
// Section numbers must be monotonically increasing in the section
// header, but our Sections array is not sorted by section number,
// so make a copy of Sections and sort it.
std::vector<COFFSection *> Arr;
for (auto &Section : Sections)
Arr.push_back(Section.get());
llvm::sort(Arr, [](const COFFSection *A, const COFFSection *B) {
return A->Number < B->Number;
});
for (auto &Section : Arr) {
if (Section->Number == -1)
continue;
COFF::section &S = Section->Header;
if (Section->Relocations.size() >= 0xffff)
S.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
W.OS.write(S.Name, COFF::NameSize);
W.write<uint32_t>(S.VirtualSize);
W.write<uint32_t>(S.VirtualAddress);
W.write<uint32_t>(S.SizeOfRawData);
W.write<uint32_t>(S.PointerToRawData);
W.write<uint32_t>(S.PointerToRelocations);
W.write<uint32_t>(S.PointerToLineNumbers);
W.write<uint16_t>(S.NumberOfRelocations);
W.write<uint16_t>(S.NumberOfLineNumbers);
W.write<uint32_t>(S.Characteristics);
}
}
void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
W.write<uint32_t>(R.VirtualAddress);
W.write<uint32_t>(R.SymbolTableIndex);
W.write<uint16_t>(R.Type);
}
// Write MCSec's contents. What this function does is essentially
// "Asm.writeSectionData(&MCSec, Layout)", but it's a bit complicated
// because it needs to compute a CRC.
uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCSection &MCSec) {
// Save the contents of the section to a temporary buffer, we need this
// to CRC the data before we dump it into the object file.
SmallVector<char, 128> Buf;
raw_svector_ostream VecOS(Buf);
Asm.writeSectionData(VecOS, &MCSec, Layout);
// Write the section contents to the object file.
W.OS << Buf;
// Calculate our CRC with an initial value of '0'. This is not how
// JamCRC is specified, but it aligns with the expected output.
JamCRC JC(/*Init=*/0);
JC.update(makeArrayRef(reinterpret_cast<uint8_t*>(Buf.data()), Buf.size()));
return JC.getCRC();
}
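// Illustrative sketch (not part of the original file): a bitwise model of the
// checksum used above. JamCRC is reflected CRC-32 (polynomial 0xEDB88320) with
// no final XOR; the writer seeds it with 0 instead of the specified
// 0xFFFFFFFF, which the Init parameter models here. The function name is made
// up for the example.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static uint32_t jamCRCModel(const uint8_t *Data, size_t Len, uint32_t Init) {
  uint32_t CRC = Init;
  for (size_t i = 0; i != Len; ++i) {
    CRC ^= Data[i];
    for (int Bit = 0; Bit < 8; ++Bit)              // reflected CRC-32 step
      CRC = (CRC >> 1) ^ (0xEDB88320u & (0u - (CRC & 1u)));
  }
  return CRC;                                      // no final XOR, as in JamCRC
}

int main() {
  const uint8_t Data[] = {'a', 'b', 'c'};
  std::printf("%08x\n", jamCRCModel(Data, sizeof(Data), /*Init=*/0));
}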
void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
const MCAsmLayout &Layout,
const COFFSection &Sec,
const MCSection &MCSec) {
if (Sec.Number == -1)
return;
// Write the section contents.
if (Sec.Header.PointerToRawData != 0) {
assert(W.OS.tell() == Sec.Header.PointerToRawData &&
"Section::PointerToRawData is insane!");
uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
// Update the section definition auxiliary symbol to record the CRC.
COFFSection *Sec = SectionMap[&MCSec];
COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
AuxSymbol &SecDef = AuxSyms[0];
SecDef.Aux.SectionDefinition.CheckSum = CRC;
}
// Write relocations for this section.
if (Sec.Relocations.empty()) {
assert(Sec.Header.PointerToRelocations == 0 &&
"Section::PointerToRelocations is insane!");
return;
}
assert(W.OS.tell() == Sec.Header.PointerToRelocations &&
"Section::PointerToRelocations is insane!");
if (Sec.Relocations.size() >= 0xffff) {
// In case of overflow, write the actual relocation count as the first
// relocation, including the synthetic reloc itself (hence the + 1).
COFF::relocation R;
R.VirtualAddress = Sec.Relocations.size() + 1;
R.SymbolTableIndex = 0;
R.Type = 0;
WriteRelocation(R);
}
for (const auto &Relocation : Sec.Relocations)
WriteRelocation(Relocation.Data);
}
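// Illustrative sketch (not part of the original file): the relocation-overflow
// convention used above. With 0xffff or more relocations the 16-bit
// NumberOfRelocations field saturates, IMAGE_SCN_LNK_NRELOC_OVFL is set, and
// the real count (including the synthetic entry itself) is stored in the
// VirtualAddress field of relocation #0. The struct and helper names are made
// up for the example.
#include <cstdint>

struct SectionHeaderModel {
  uint16_t NumberOfRelocations;
  bool NRelocOvfl;
  uint32_t RelocZeroVirtualAddress; // only meaningful when NRelocOvfl is set
};

static SectionHeaderModel describeRelocCount(uint32_t NumRelocs) {
  SectionHeaderModel H{};
  if (NumRelocs >= 0xffffu) {
    H.NumberOfRelocations = 0xffffu;
    H.NRelocOvfl = true;
    H.RelocZeroVirtualAddress = NumRelocs + 1; // + the synthetic reloc #0
  } else {
    H.NumberOfRelocations = static_cast<uint16_t>(NumRelocs);
  }
  return H;
}

int main() {
  return describeRelocCount(70000).NumberOfRelocations == 0xffff ? 0 : 1;
}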
////////////////////////////////////////////////////////////////////////////////
// MCObjectWriter interface implementations
void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
if (EmitAddrsigSection) {
AddrsigSection = Asm.getContext().getCOFFSection(
".llvm_addrsig", COFF::IMAGE_SCN_LNK_REMOVE,
SectionKind::getMetadata());
Asm.registerSection(*AddrsigSection);
}
if (!Asm.CGProfile.empty()) {
CGProfileSection = Asm.getContext().getCOFFSection(
".llvm.call-graph-profile", COFF::IMAGE_SCN_LNK_REMOVE,
SectionKind::getMetadata());
Asm.registerSection(*CGProfileSection);
}
// "Define" each section & symbol. This creates section & symbol
// entries in the staging area.
for (const auto &Section : Asm)
defineSection(static_cast<const MCSectionCOFF &>(Section));
for (const MCSymbol &Symbol : Asm.symbols())
if (!Symbol.isTemporary())
DefineSymbol(Symbol, Asm, Layout);
}
bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
const MCAssembler &Asm, const MCSymbol &SymA, const MCFragment &FB,
bool InSet, bool IsPCRel) const {
// Don't drop relocations between functions, even if they are in the same text
// section. Multiple Visual C++ linker features depend on having the
// relocations present. The /INCREMENTAL flag will cause these relocations to
// point to thunks, and the /GUARD:CF flag assumes that it can use relocations
// to approximate the set of all address taken functions. LLD's implementation
// of /GUARD:CF also relies on the existence of these relocations.
uint16_t Type = cast<MCSymbolCOFF>(SymA).getType();
if ((Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION)
return false;
return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
InSet, IsPCRel);
}
void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
assert(Target.getSymA() && "Relocation must reference a symbol!");
const MCSymbol &A = Target.getSymA()->getSymbol();
if (!A.isRegistered()) {
Asm.getContext().reportError(Fixup.getLoc(),
Twine("symbol '") + A.getName() +
"' can not be undefined");
return;
}
if (A.isTemporary() && A.isUndefined()) {
Asm.getContext().reportError(Fixup.getLoc(),
Twine("assembler label '") + A.getName() +
"' can not be undefined");
return;
}
MCSection *MCSec = Fragment->getParent();
// Mark this symbol as requiring an entry in the symbol table.
assert(SectionMap.find(MCSec) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
COFFSection *Sec = SectionMap[MCSec];
const MCSymbolRefExpr *SymB = Target.getSymB();
if (SymB) {
const MCSymbol *B = &SymB->getSymbol();
if (!B->getFragment()) {
Asm.getContext().reportError(
Fixup.getLoc(),
Twine("symbol '") + B->getName() +
"' can not be undefined in a subtraction expression");
return;
}
// Offset of the symbol in the section
int64_t OffsetOfB = Layout.getSymbolOffset(*B);
// Offset of the relocation in the section
int64_t OffsetOfRelocation =
Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
FixedValue = (OffsetOfRelocation - OffsetOfB) + Target.getConstant();
} else {
FixedValue = Target.getConstant();
}
COFFRelocation Reloc;
Reloc.Data.SymbolTableIndex = 0;
Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment);
// Turn relocations for temporary symbols into section relocations.
if (A.isTemporary()) {
MCSection *TargetSection = &A.getSection();
assert(
SectionMap.find(TargetSection) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
Reloc.Symb = SectionMap[TargetSection]->Symbol;
FixedValue += Layout.getSymbolOffset(A);
} else {
assert(
SymbolMap.find(&A) != SymbolMap.end() &&
"Symbol must already have been defined in executePostLayoutBinding!");
Reloc.Symb = SymbolMap[&A];
}
++Reloc.Symb->Relocations;
Reloc.Data.VirtualAddress += Fixup.getOffset();
Reloc.Data.Type = TargetObjectWriter->getRelocType(
Asm.getContext(), Target, Fixup, SymB, Asm.getBackend());
// FIXME: Can anyone explain what this does other than adjust for the size
// of the offset?
if ((Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 &&
Reloc.Data.Type == COFF::IMAGE_REL_AMD64_REL32) ||
(Header.Machine == COFF::IMAGE_FILE_MACHINE_I386 &&
Reloc.Data.Type == COFF::IMAGE_REL_I386_REL32))
FixedValue += 4;
if (Header.Machine == COFF::IMAGE_FILE_MACHINE_ARMNT) {
switch (Reloc.Data.Type) {
case COFF::IMAGE_REL_ARM_ABSOLUTE:
case COFF::IMAGE_REL_ARM_ADDR32:
case COFF::IMAGE_REL_ARM_ADDR32NB:
case COFF::IMAGE_REL_ARM_TOKEN:
case COFF::IMAGE_REL_ARM_SECTION:
case COFF::IMAGE_REL_ARM_SECREL:
break;
case COFF::IMAGE_REL_ARM_BRANCH11:
case COFF::IMAGE_REL_ARM_BLX11:
// IMAGE_REL_ARM_BRANCH11 and IMAGE_REL_ARM_BLX11 are only used for
// pre-ARMv7, which implicitly rules them out of ARMNT (they would be valid
// for Windows CE).
case COFF::IMAGE_REL_ARM_BRANCH24:
case COFF::IMAGE_REL_ARM_BLX24:
case COFF::IMAGE_REL_ARM_MOV32A:
// IMAGE_REL_ARM_BRANCH24, IMAGE_REL_ARM_BLX24, IMAGE_REL_ARM_MOV32A are
// only used for ARM mode code, which is documented as being unsupported
// by Windows on ARM. Empirical evidence indicates that masm is able to
// generate these relocations, but the rest of the MSVC toolchain is
// unable to handle them.
llvm_unreachable("unsupported relocation");
break;
case COFF::IMAGE_REL_ARM_MOV32T:
break;
case COFF::IMAGE_REL_ARM_BRANCH20T:
case COFF::IMAGE_REL_ARM_BRANCH24T:
case COFF::IMAGE_REL_ARM_BLX23T:
// IMAGE_REL_ARM_BRANCH20T, IMAGE_REL_ARM_BRANCH24T and IMAGE_REL_ARM_BLX23T
// all require a 4-byte adjustment to the relocation. Relative branches are
// offset by 4 on ARM, but because there are no RELA relocations, the
// adjustment is applied to the fixed value here.
FixedValue = FixedValue + 4;
break;
}
}
// The fixed value never makes sense for section indices; ignore it.
if (Fixup.getKind() == FK_SecRel_2)
FixedValue = 0;
if (TargetObjectWriter->recordRelocation(Fixup))
Sec->Relocations.push_back(Reloc);
}
static std::time_t getTime() {
std::time_t Now = time(nullptr);
if (Now < 0 || !isUInt<32>(Now))
return UINT32_MAX;
return Now;
}
// Create .file symbols.
void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
for (const std::string &Name : Asm.getFileNames()) {
// round up to calculate the number of auxiliary symbols required
unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
unsigned Count = (Name.size() + SymbolSize - 1) / SymbolSize;
COFFSymbol *File = createSymbol(".file");
File->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
File->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
File->Aux.resize(Count);
unsigned Offset = 0;
unsigned Length = Name.size();
for (auto &Aux : File->Aux) {
Aux.AuxType = ATFile;
if (Length > SymbolSize) {
memcpy(&Aux.Aux, Name.c_str() + Offset, SymbolSize);
Length = Length - SymbolSize;
} else {
memcpy(&Aux.Aux, Name.c_str() + Offset, Length);
memset((char *)&Aux.Aux + Length, 0, SymbolSize - Length);
break;
}
Offset += SymbolSize;
}
}
}
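// Illustrative sketch (not part of the original file): how many auxiliary
// records a ".file" symbol needs for a given file-name length, mirroring the
// rounding above. A regular COFF symbol record is 18 bytes and a bigobj
// record is 20 bytes. The helper name is made up for the example.
#include <cstdio>

static unsigned fileAuxCount(unsigned NameLen, bool BigObj) {
  unsigned SymbolSize = BigObj ? 20 : 18;          // bytes per aux record
  return (NameLen + SymbolSize - 1) / SymbolSize;  // round up
}

int main() {
  std::printf("%u\n", fileAuxCount(37, /*BigObj=*/false)); // 37 bytes -> 3 records
}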
void WinCOFFObjectWriter::setWeakDefaultNames() {
if (WeakDefaults.empty())
return;
// If multiple object files use a weak symbol (either with a regular
// defined default, or an absolute zero symbol as default), the defaults
// cause duplicate definitions unless their names are made unique. Look
// for a defined external symbol that isn't comdat; that should be unique
// unless there are other duplicate definitions. If none is found,
// allow picking a comdat symbol, as that's still better than nothing.
COFFSymbol *Unique = nullptr;
for (bool AllowComdat : {false, true}) {
for (auto &Sym : Symbols) {
// Don't include the names of the defaults themselves
if (WeakDefaults.count(Sym.get()))
continue;
// Only consider external symbols
if (Sym->Data.StorageClass != COFF::IMAGE_SYM_CLASS_EXTERNAL)
continue;
// Only consider symbols defined in a section or that are absolute
if (!Sym->Section && Sym->Data.SectionNumber != COFF::IMAGE_SYM_ABSOLUTE)
continue;
if (!AllowComdat && Sym->Section &&
Sym->Section->Header.Characteristics & COFF::IMAGE_SCN_LNK_COMDAT)
continue;
Unique = Sym.get();
break;
}
if (Unique)
break;
}
// If we didn't find any unique symbol to use for the names, just skip this.
if (!Unique)
return;
for (auto *Sym : WeakDefaults) {
Sym->Name.append(".");
Sym->Name.append(Unique->Name);
}
}
static bool isAssociative(const COFFSection &Section) {
return Section.Symbol->Aux[0].Aux.SectionDefinition.Selection ==
COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
}
void WinCOFFObjectWriter::assignSectionNumbers() {
size_t I = 1;
auto Assign = [&](COFFSection &Section) {
Section.Number = I;
Section.Symbol->Data.SectionNumber = I;
Section.Symbol->Aux[0].Aux.SectionDefinition.Number = I;
++I;
};
// Although it is not explicitly requested by the Microsoft COFF spec,
// we should avoid emitting forward associative section references,
// because MSVC link.exe as of 2017 cannot handle that.
for (const std::unique_ptr<COFFSection> &Section : Sections)
if (!isAssociative(*Section))
Assign(*Section);
for (const std::unique_ptr<COFFSection> &Section : Sections)
if (isAssociative(*Section))
Assign(*Section);
}
// Assign file offsets to COFF object file structures.
void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
const MCAsmLayout &Layout) {
unsigned Offset = W.OS.tell();
Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
Offset += COFF::SectionSize * Header.NumberOfSections;
for (const auto &Section : Asm) {
COFFSection *Sec = SectionMap[&Section];
if (Sec->Number == -1)
continue;
Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
if (IsPhysicalSection(Sec)) {
Sec->Header.PointerToRawData = Offset;
Offset += Sec->Header.SizeOfRawData;
}
if (!Sec->Relocations.empty()) {
bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
if (RelocationsOverflow) {
// Signal overflow by setting NumberOfRelocations to max value. Actual
// size is found in reloc #0. Microsoft tools understand this.
Sec->Header.NumberOfRelocations = 0xffff;
} else {
Sec->Header.NumberOfRelocations = Sec->Relocations.size();
}
Sec->Header.PointerToRelocations = Offset;
if (RelocationsOverflow) {
// Reloc #0 will contain actual count, so make room for it.
Offset += COFF::RelocationSize;
}
Offset += COFF::RelocationSize * Sec->Relocations.size();
for (auto &Relocation : Sec->Relocations) {
assert(Relocation.Symb->getIndex() != -1);
Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
}
}
assert(Sec->Symbol->Aux.size() == 1 &&
"Section's symbol must have one aux!");
AuxSymbol &Aux = Sec->Symbol->Aux[0];
assert(Aux.AuxType == ATSectionDefinition &&
"Section's symbol's aux symbol must be a Section Definition!");
Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
Aux.Aux.SectionDefinition.NumberOfRelocations =
Sec->Header.NumberOfRelocations;
Aux.Aux.SectionDefinition.NumberOfLinenumbers =
Sec->Header.NumberOfLineNumbers;
}
Header.PointerToSymbolTable = Offset;
}
uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
const MCAsmLayout &Layout) {
uint64_t StartOffset = W.OS.tell();
if (Sections.size() > INT32_MAX)
report_fatal_error(
"PE COFF object files can't have more than 2147483647 sections");
UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
Header.NumberOfSections = Sections.size();
Header.NumberOfSymbols = 0;
setWeakDefaultNames();
assignSectionNumbers();
createFileSymbols(Asm);
for (auto &Symbol : Symbols) {
// Update section number & offset for symbols that have them.
if (Symbol->Section)
Symbol->Data.SectionNumber = Symbol->Section->Number;
Symbol->setIndex(Header.NumberOfSymbols++);
// Update auxiliary symbol info.
Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size();
Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols;
}
// Build string table.
for (const auto &S : Sections)
if (S->Name.size() > COFF::NameSize)
Strings.add(S->Name);
for (const auto &S : Symbols)
if (S->Name.size() > COFF::NameSize)
Strings.add(S->Name);
Strings.finalize();
// Set names.
for (const auto &S : Sections)
SetSectionName(*S);
for (auto &S : Symbols)
SetSymbolName(*S);
// Fixup weak external references.
for (auto &Symbol : Symbols) {
if (Symbol->Other) {
assert(Symbol->getIndex() != -1);
assert(Symbol->Aux.size() == 1 && "Symbol must contain one aux symbol!");
assert(Symbol->Aux[0].AuxType == ATWeakExternal &&
"Symbol's aux symbol must be a Weak External!");
Symbol->Aux[0].Aux.WeakExternal.TagIndex = Symbol->Other->getIndex();
}
}
// Fixup associative COMDAT sections.
for (auto &Section : Sections) {
if (Section->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
continue;
const MCSectionCOFF &MCSec = *Section->MCSection;
const MCSymbol *AssocMCSym = MCSec.getCOMDATSymbol();
assert(AssocMCSym);
// It's an error to try to associate with an undefined symbol or a symbol
// without a section.
if (!AssocMCSym->isInSection()) {
Asm.getContext().reportError(
SMLoc(), Twine("cannot make section ") + MCSec.getName() +
Twine(" associative with sectionless symbol ") +
AssocMCSym->getName());
continue;
}
const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection());
assert(SectionMap.count(AssocMCSec));
COFFSection *AssocSec = SectionMap[AssocMCSec];
// Skip this section if the associated section is unused.
if (AssocSec->Number == -1)
continue;
Section->Symbol->Aux[0].Aux.SectionDefinition.Number = AssocSec->Number;
}
// Create the contents of the .llvm_addrsig section.
if (EmitAddrsigSection) {
auto Frag = new MCDataFragment(AddrsigSection);
Frag->setLayoutOrder(0);
raw_svector_ostream OS(Frag->getContents());
for (const MCSymbol *S : AddrsigSyms) {
if (!S->isTemporary()) {
encodeULEB128(S->getIndex(), OS);
continue;
}
MCSection *TargetSection = &S->getSection();
assert(SectionMap.find(TargetSection) != SectionMap.end() &&
"Section must already have been defined in "
"executePostLayoutBinding!");
encodeULEB128(SectionMap[TargetSection]->Symbol->getIndex(), OS);
}
}
// Create the contents of the .llvm.call-graph-profile section.
if (CGProfileSection) {
auto *Frag = new MCDataFragment(CGProfileSection);
Frag->setLayoutOrder(0);
raw_svector_ostream OS(Frag->getContents());
for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
support::endian::write(OS, FromIndex, W.Endian);
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
}
}
assignFileOffsets(Asm, Layout);
// MS LINK expects to be able to use this timestamp to implement its
// /INCREMENTAL feature.
if (Asm.isIncrementalLinkerCompatible()) {
Header.TimeDateStamp = getTime();
} else {
// Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
Header.TimeDateStamp = 0;
}
// Write it all to disk...
WriteFileHeader(Header);
writeSectionHeaders();
// Write section contents.
sections::iterator I = Sections.begin();
sections::iterator IE = Sections.end();
MCAssembler::iterator J = Asm.begin();
MCAssembler::iterator JE = Asm.end();
for (; I != IE && J != JE; ++I, ++J)
writeSection(Asm, Layout, **I, *J);
assert(W.OS.tell() == Header.PointerToSymbolTable &&
"Header::PointerToSymbolTable is insane!");
// Write a symbol table.
for (auto &Symbol : Symbols)
if (Symbol->getIndex() != -1)
WriteSymbol(*Symbol);
// Write a string table, which completes the entire COFF file.
Strings.write(W.OS);
return W.OS.tell() - StartOffset;
}
MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_)
: Machine(Machine_) {}
// Pin the vtable to this file.
void MCWinCOFFObjectTargetWriter::anchor() {}
//------------------------------------------------------------------------------
// WinCOFFObjectWriter factory function
std::unique_ptr<MCObjectWriter> llvm::createWinCOFFObjectWriter(
std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) {
return std::make_unique<WinCOFFObjectWriter>(std::move(MOTW), OS);
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index efa3fd5ca9ce..4789a9f02937 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1,3144 +1,3139 @@
//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// | | Higher address
// |-----------------------------------|
// | |
// | arguments passed on the stack |
// | |
// |-----------------------------------| <- sp
// | | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the
// main function body, after the prologue is run. However, it's depicted here
// for completeness.
//
// | | Higher address
// |-----------------------------------|
// | |
// | arguments passed on the stack |
// | |
// |-----------------------------------|
// | |
// | (Win64 only) varargs from reg |
// | |
// |-----------------------------------|
// | |
// | callee-saved gpr registers | <--.
// | | | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
// | | | (frame record first)
// | prev_fp, prev_lr | <--'
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | |
// | callee-saved fp/simd/SVE regs |
// | |
// |-----------------------------------|
// | |
// | SVE stack objects |
// | |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....| compile time; if present)
// |-----------------------------------|
// | |
// | local variables of fixed size |
// | including spill slots |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....| LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................| compile time)
// |-----------------------------------| <- sp
// | | Lower address
//
//
// To access data in a frame, a constant offset from one of the pointers
// (fp, bp, sp) must be computable at compile time. The size of the areas
// with a dotted background cannot be computed at compile time if they are
// present, so all three of fp, bp and sp must be set up in order to access
// all contents in the frame areas, assuming all of the frame areas are
// non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
// variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
// ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// FIXME: also explain the redzone concept.
// FIXME: also explain the concept of reserved call frames.
//
//===----------------------------------------------------------------------===//
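// Illustrative source-level example (not part of this file): a function that
// ends up needing all three of fp, bp and sp by the rules above. The
// over-aligned local forces stack realignment (and hence a frame pointer),
// and combining it with a VLA forces a base pointer so the fixed-size locals
// stay addressable:
//
//   void f(int n) {
//     alignas(64) int big[8];   // more-than-default alignment -> needs fp
//     int vla[n];               // VLA -> together with 'big', needs bp
//     use(big, vla, n);         // 'use' is a placeholder callee
//   }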
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "frame-info"
static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
static cl::opt<bool>
ReverseCSRRestoreSeq("reverse-csr-restore-seq",
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
static cl::opt<bool> StackTaggingMergeSetTag(
"stack-tagging-merge-settag",
cl::desc("merge settag instruction in function epilog"), cl::init(true),
cl::Hidden);
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// Returns the argument pop size.
static uint64_t getArgumentPopSize(MachineFunction &MF,
MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
bool IsTailCallReturn = false;
if (MBB.end() != MBBI) {
unsigned RetOpcode = MBBI->getOpcode();
IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
RetOpcode == AArch64::TCRETURNri ||
RetOpcode == AArch64::TCRETURNriBTI;
}
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t ArgumentPopSize = 0;
if (IsTailCallReturn) {
MachineOperand &StackAdjust = MBBI->getOperand(1);
// For a tail-call in a callee-pops-arguments environment, some or all of
// the stack may actually be in use for the call's arguments; this is
// calculated during LowerCall and consumed here...
ArgumentPopSize = StackAdjust.getImm();
} else {
// ... otherwise the amount to pop is *all* of the argument space,
// conveniently stored in the MachineFunctionInfo by
// LowerFormalArguments. This will, of course, be zero for the C calling
// convention.
ArgumentPopSize = AFI->getArgumentStackToRestore();
}
return ArgumentPopSize;
}
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode any
/// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
// FIXME: For now, just conservatively guestimate based on unscaled indexing
// range. We'll end up allocating an unnecessary spill slot a lot, but
// realistically that's not a big deal at this stage of the game.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (MI.isDebugInstr() || MI.isPseudo() ||
MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::ADDSXri)
continue;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isFI())
continue;
StackOffset Offset;
if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
AArch64FrameOffsetCannotUpdate)
return 0;
}
}
}
return DefaultSafeSPDisplacement;
}
TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
return TargetStackID::SVEVector;
}
/// Returns the size of the fixed object area (allocated next to sp on entry)
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
const AArch64FunctionInfo *AFI, bool IsWin64,
bool IsFunclet) {
if (!IsWin64 || IsFunclet) {
// Only Win64 uses fixed objects, and then only for the function (not
// funclets)
return 0;
} else {
// Var args are stored here in the primary function.
const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
// To support EH funclets we allocate an UnwindHelp object
const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
return alignTo(VarArgsArea + UnwindHelpObject, 16);
}
}
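// Illustrative numbers for getFixedObjectSize above (hypothetical): a Win64
// function that spilled 24 bytes of register varargs and contains EH funclets
// gets alignTo(24 + 8, 16) == 32 bytes of fixed objects; any non-Win64
// function (or a funclet itself) gets 0.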
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
}
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
// Don't use the red zone if the function explicitly asks us not to.
// This is typically used for kernel code.
if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
return false;
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t NumBytes = AFI->getLocalStackSize();
return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
getSVEStackSize(MF));
}
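// Illustrative example (hypothetical, requires -aarch64-redzone): a leaf
// function with no calls, no FP, no SVE stack and at most 128 bytes of
// locals satisfies canUseRedZone, so the prologue leaves sp untouched and
// those locals live in the 128-byte area below sp instead.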
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
// Win64 EH requires a frame pointer if funclets are present, as the locals
// are accessed off the frame pointer in both the parent function and the
// funclets.
if (MF.hasEHFunclets())
return true;
// Retain behavior of always omitting the FP for leaf functions when possible.
if (MF.getTarget().Options.DisableFramePointerElim(MF))
return true;
if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
RegInfo->needsStackRealignment(MF))
return true;
// With large callframes around we may need to use FP to access the scavenging
// emergency spillslot.
//
// Unfortunately some calls to hasFP() like machine verifier ->
// getReservedReg() -> hasFP in the middle of global isel are too early
// to know the max call frame size. Hopefully conservatively returning "true"
// in those cases is fine.
// DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
if (!MFI.isMaxCallFrameSizeComputed() ||
MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
return true;
return false;
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects();
}
MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
int64_t Amount = I->getOperand(0).getImm();
Amount = alignTo(Amount, getStackAlign());
if (!IsDestroy)
Amount = -Amount;
// N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
// doesn't have to pop anything), then the first operand will be zero too so
// this adjustment is a no-op.
if (CalleePopAmount == 0) {
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
//
// ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
//
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
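// Illustrative arithmetic for the limitation above (not actual emitted
// code): an SP adjustment of 0x12345 bytes is split into two immediates,
//   sub sp, sp, #0x12, lsl #12   // 0x12000
//   sub sp, sp, #0x345           // 0x00345
// which is why adjustments up to roughly 16MB (24 bits) need no scratch
// register.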
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
{-(int64_t)CalleePopAmount, MVT::i8}, TII);
}
return MBB.erase(I);
}
static bool ShouldSignReturnAddress(MachineFunction &MF) {
// The function should be signed in the following situations:
// - sign-return-address=all
// - sign-return-address=non-leaf and the function spills the LR
const Function &F = MF.getFunction();
if (!F.hasFnAttribute("sign-return-address"))
return false;
StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
if (Scope.equals("none"))
return false;
if (Scope.equals("all"))
return true;
assert(Scope.equals("non-leaf") && "Expected all, none or non-leaf");
for (const auto &Info : MF.getFrameInfo().getCalleeSavedInfo())
if (Info.getReg() == AArch64::LR)
return true;
return false;
}
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
const TargetInstrInfo *TII = STI.getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
return;
for (const auto &Info : CSI) {
unsigned Reg = Info.getReg();
int64_t Offset =
MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
}
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
MachineFunction *MF = MBB->getParent();
// If MBB is an entry block, use X9 as the scratch register
if (&MF->front() == MBB)
return AArch64::X9;
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(*MBB);
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
// Prefer X9 since it was historically used for the prologue scratch reg.
const MachineRegisterInfo &MRI = MF->getRegInfo();
if (LiveRegs.available(MRI, AArch64::X9))
return AArch64::X9;
for (unsigned Reg : AArch64::GPR64RegClass) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
return AArch64::NoRegister;
}
bool AArch64FrameLowering::canUseAsPrologue(
const MachineBasicBlock &MBB) const {
const MachineFunction *MF = MBB.getParent();
MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
if (!RegInfo->needsStackRealignment(*MF))
return true;
// Otherwise, we can use any block as long as it has a scratch register
// available.
return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}
static bool windowsRequiresStackProbe(MachineFunction &MF,
uint64_t StackSizeInBytes) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (!Subtarget.isTargetWindows())
return false;
const Function &F = MF.getFunction();
// TODO: When implementing stack protectors, take that into account
// for the probe threshold.
unsigned StackProbeSize = 4096;
if (F.hasFnAttribute("stack-probe-size"))
F.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
return (StackSizeInBytes >= StackProbeSize) &&
!F.hasFnAttribute("no-stack-arg-probe");
}
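// Illustrative IR-level example (hypothetical values): a function carrying
//   attributes { "stack-probe-size"="8192" }
// is only probed via __chkstk once StackSizeInBytes >= 8192, and
// "no-stack-arg-probe" suppresses the probe entirely.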
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
MachineFunction &MF, uint64_t StackBumpBytes) const {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (AFI->getLocalStackSize() == 0)
return false;
// 512 is the maximum immediate for stp/ldp that will be used for
// callee-save save/restores
if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
return false;
if (MFI.hasVarSizedObjects())
return false;
if (RegInfo->needsStackRealignment(MF))
return false;
// This isn't strictly necessary, but it simplifies things a bit since the
// current RedZone handling code assumes the SP is adjusted by the
// callee-save save/restore code.
if (canUseRedZone(MF))
return false;
// When there is an SVE area on the stack, always allocate the
// callee-saves and spills/locals separately.
if (getSVEStackSize(MF))
return false;
return true;
}
bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
return false;
if (MBB.empty())
return true;
// Disable combined SP bump if the last instruction is an MTE tag store. It
// is almost always better to merge SP adjustment into those instructions.
MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastI != Begin) {
--LastI;
if (LastI->isTransient())
continue;
if (!LastI->getFlag(MachineInstr::FrameDestroy))
break;
}
switch (LastI->getOpcode()) {
case AArch64::STGloop:
case AArch64::STZGloop:
case AArch64::STGOffset:
case AArch64::STZGOffset:
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
return false;
default:
return true;
}
llvm_unreachable("unreachable");
}
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
const TargetInstrInfo &TII,
MachineInstr::MIFlag Flag) {
unsigned Opc = MBBI->getOpcode();
MachineBasicBlock *MBB = MBBI->getParent();
MachineFunction &MF = *MBB->getParent();
DebugLoc DL = MBBI->getDebugLoc();
unsigned ImmIdx = MBBI->getNumOperands() - 1;
int Imm = MBBI->getOperand(ImmIdx).getImm();
MachineInstrBuilder MIB;
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
switch (Opc) {
default:
llvm_unreachable("No SEH Opcode for this instruction");
case AArch64::LDPDpost:
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STPDpre: {
unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
.addImm(Reg0)
.addImm(Reg1)
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
case AArch64::LDPXpost:
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STPXpre: {
Register Reg0 = MBBI->getOperand(1).getReg();
Register Reg1 = MBBI->getOperand(2).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
.addImm(Imm * 8)
.setMIFlag(Flag);
else
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
.addImm(RegInfo->getSEHRegNum(Reg0))
.addImm(RegInfo->getSEHRegNum(Reg1))
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
case AArch64::LDRDpost:
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STRDpre: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
.addImm(Reg)
.addImm(Imm)
.setMIFlag(Flag);
break;
}
case AArch64::LDRXpost:
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STRXpre: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
.addImm(Reg)
.addImm(Imm)
.setMIFlag(Flag);
break;
}
case AArch64::STPDi:
case AArch64::LDPDi: {
unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
.addImm(Reg0)
.addImm(Reg1)
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
case AArch64::STPXi:
case AArch64::LDPXi: {
Register Reg0 = MBBI->getOperand(0).getReg();
Register Reg1 = MBBI->getOperand(1).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
.addImm(Imm * 8)
.setMIFlag(Flag);
else
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
.addImm(RegInfo->getSEHRegNum(Reg0))
.addImm(RegInfo->getSEHRegNum(Reg1))
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
case AArch64::STRXui:
case AArch64::LDRXui: {
int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
.addImm(Reg)
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
case AArch64::STRDui:
case AArch64::LDRDui: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
.addImm(Reg)
.addImm(Imm * 8)
.setMIFlag(Flag);
break;
}
}
auto I = MBB->insertAfter(MBBI, MIB);
return I;
}
// Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
unsigned LocalStackSize) {
MachineOperand *ImmOpnd = nullptr;
unsigned ImmIdx = MBBI->getNumOperands() - 1;
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Fix the offset in the SEH instruction");
case AArch64::SEH_SaveFPLR:
case AArch64::SEH_SaveRegP:
case AArch64::SEH_SaveReg:
case AArch64::SEH_SaveFRegP:
case AArch64::SEH_SaveFReg:
ImmOpnd = &MBBI->getOperand(ImmIdx);
break;
}
if (ImmOpnd)
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
// Ignore instructions that do not operate on SP, i.e. shadow call stack
// instructions and associated CFI instruction.
while (MBBI->getOpcode() == AArch64::STRXpost ||
MBBI->getOpcode() == AArch64::LDRXpre ||
MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
assert(MBBI->getOperand(0).getReg() != AArch64::SP);
++MBBI;
}
unsigned NewOpc;
int Scale = 1;
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
case AArch64::STPXi:
NewOpc = AArch64::STPXpre;
Scale = 8;
break;
case AArch64::STPDi:
NewOpc = AArch64::STPDpre;
Scale = 8;
break;
case AArch64::STPQi:
NewOpc = AArch64::STPQpre;
Scale = 16;
break;
case AArch64::STRXui:
NewOpc = AArch64::STRXpre;
break;
case AArch64::STRDui:
NewOpc = AArch64::STRDpre;
break;
case AArch64::STRQui:
NewOpc = AArch64::STRQpre;
break;
case AArch64::LDPXi:
NewOpc = AArch64::LDPXpost;
Scale = 8;
break;
case AArch64::LDPDi:
NewOpc = AArch64::LDPDpost;
Scale = 8;
break;
case AArch64::LDPQi:
NewOpc = AArch64::LDPQpost;
Scale = 16;
break;
case AArch64::LDRXui:
NewOpc = AArch64::LDRXpost;
break;
case AArch64::LDRDui:
NewOpc = AArch64::LDRDpost;
break;
case AArch64::LDRQui:
NewOpc = AArch64::LDRQpost;
break;
}
// Get rid of the SEH code associated with the old instruction.
if (NeedsWinCFI) {
auto SEH = std::next(MBBI);
if (AArch64InstrInfo::isSEHInstruction(*SEH))
SEH->eraseFromParent();
}
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
MIB.addReg(AArch64::SP, RegState::Define);
// Copy all operands other than the immediate offset.
unsigned OpndIdx = 0;
for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
++OpndIdx)
MIB.add(MBBI->getOperand(OpndIdx));
assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
"Unexpected immediate offset in first/last callee-save save/restore "
"instruction!");
assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
"Unexpected base register in callee-save save/restore instruction!");
assert(CSStackSizeInc % Scale == 0);
MIB.addImm(CSStackSizeInc / Scale);
MIB.setMIFlags(MBBI->getFlags());
MIB.setMemRefs(MBBI->memoperands());
// Generate a new SEH code that corresponds to the new instruction.
if (NeedsWinCFI) {
*HasWinCFI = true;
InsertSEH(*MIB, *TII,
InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
}
return std::prev(MBB.erase(MBBI));
}
// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
uint64_t LocalStackSize,
bool NeedsWinCFI,
bool *HasWinCFI) {
if (AArch64InstrInfo::isSEHInstruction(MI))
return;
unsigned Opc = MI.getOpcode();
// Ignore instructions that do not operate on SP, i.e. shadow call stack
// instructions and associated CFI instruction.
if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
Opc == AArch64::CFI_INSTRUCTION) {
if (Opc != AArch64::CFI_INSTRUCTION)
assert(MI.getOperand(0).getReg() != AArch64::SP);
return;
}
unsigned Scale;
switch (Opc) {
case AArch64::STPXi:
case AArch64::STRXui:
case AArch64::STPDi:
case AArch64::STRDui:
case AArch64::LDPXi:
case AArch64::LDRXui:
case AArch64::LDPDi:
case AArch64::LDRDui:
Scale = 8;
break;
case AArch64::STPQi:
case AArch64::STRQui:
case AArch64::LDPQi:
case AArch64::LDRQui:
Scale = 16;
break;
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
}
unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
"Unexpected base register in callee-save save/restore instruction!");
// Last operand is immediate offset that needs fixing.
MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
// All generated opcodes have scaled offsets.
assert(LocalStackSize % Scale == 0);
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
if (NeedsWinCFI) {
*HasWinCFI = true;
auto MBBI = std::next(MachineBasicBlock::iterator(MI));
assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
"Expecting a SEH instruction");
fixupSEHOpcode(MBBI, LocalStackSize);
}
}
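// Illustrative example for the offset fixup above (hypothetical values):
// with a combined SP bump and LocalStackSize == 64, a callee-save store
//   stp x20, x19, [sp, #16]   // scaled immediate 2, Scale == 8
// is rewritten to
//   stp x20, x19, [sp, #80]   // 2 + 64/8 == 10 -> byte offset 80
// so it still lands in the callee-save area after the larger SP decrement.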
static void adaptForLdStOpt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator FirstSPPopI,
MachineBasicBlock::iterator LastPopI) {
// Sometimes (when we restore in the same order as we save), we can end up
// with code like this:
//
// ldp x26, x25, [sp]
// ldp x24, x23, [sp, #16]
// ldp x22, x21, [sp, #32]
// ldp x20, x19, [sp, #48]
// add sp, sp, #64
//
// In this case, it is always better to put the first ldp at the end, so
// that the load-store optimizer can run and merge the ldp and the add into
// a post-index ldp.
// If we managed to grab the first pop instruction, move it to the end.
if (ReverseCSRRestoreSeq)
MBB.splice(FirstSPPopI, &MBB, LastPopI);
// We should end up with something like this now:
//
// ldp x24, x23, [sp, #16]
// ldp x22, x21, [sp, #32]
// ldp x20, x19, [sp, #48]
// ldp x26, x25, [sp]
// add sp, sp, #64
//
// and the load-store optimizer can merge the last two instructions into:
//
// ldp x26, x25, [sp], #64
//
}
static bool ShouldSignWithAKey(MachineFunction &MF) {
const Function &F = MF.getFunction();
if (!F.hasFnAttribute("sign-return-address-key"))
return true;
const StringRef Key =
F.getFnAttribute("sign-return-address-key").getValueAsString();
assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
return Key.equals_lower("a_key");
}
static bool needsWinCFI(const MachineFunction &MF) {
const Function &F = MF.getFunction();
return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
F.needsUnwindTableEntry();
}
static bool isTargetDarwin(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
}
static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}
// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_PXI:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
}
}
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const Function &F = MF.getFunction();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves =
MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
bool IsFunclet = MBB.isEHFuncletEntry();
// At this point, we're going to decide whether or not the function uses a
// redzone. In most cases, the function doesn't have a redzone so let's
// assume that's false and set it to true in the case that there's a redzone.
AFI->setHasRedZone(false);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
if (ShouldSignReturnAddress(MF)) {
if (ShouldSignWithAKey(MF))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
.setMIFlag(MachineInstr::FrameSetup);
else {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
.setMIFlag(MachineInstr::FrameSetup);
}
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
// Set tagged base pointer to the bottom of the stack frame.
// Ideally it should match SP value after prologue.
AFI->setTaggedBasePointerOffset(MFI.getStackSize());
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// getStackSize() includes all the locals in its size calculation. We don't
// include these locals when computing the stack size of a funclet, as they
// are allocated in the parent's stack frame and accessed via the frame
// pointer from the funclet. We only save the callee saved registers in the
// funclet, which are really the callee saved registers of the parent
// function, including the funclet.
int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
: MFI.getStackSize();
if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
assert(!SVEStackSize &&
"unexpected function without stack frame but with SVE objects");
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
if (!NumBytes)
return;
// REDZONE: If the stack size is at most 128 bytes, we don't need
// to actually allocate.
if (canUseRedZone(MF)) {
AFI->setHasRedZone(true);
++NumRedZoneFunctions;
} else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI && needsFrameMoves) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
}
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
.setMIFlag(MachineInstr::FrameSetup);
}
return;
}
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
// Move past the saves of the callee-saved registers, fixing up the offsets
// and pre-inc if we decided to combine the callee-save and local stack
// pointer bump above.
MachineBasicBlock::iterator End = MBB.end();
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
++MBBI;
}
// For funclets the FP belongs to the containing function.
if (!IsFunclet && HasFP) {
// Only set up FP if we actually need to.
int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
{FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
NeedsWinCFI, &HasWinCFI);
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
uint64_t NumWords = NumBytes >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
// exceed this amount. We need to move at most 2^24 - 1 into x15.
// This is at most two instructions, MOVZ followed by MOVK.
// TODO: Fix to use multiple stack alloc unwind codes for stacks
// exceeding 256MB in size.
if (NumBytes >= (1 << 28))
report_fatal_error("Stack size cannot exceed 256MB for stack "
"unwinding purposes");
uint32_t LowNumWords = NumWords & 0xFFFF;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
.addImm(LowNumWords)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
if ((NumWords & 0xFFFF0000) != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
.addReg(AArch64::X15)
.addImm((NumWords & 0xFFFF0000) >> 16) // High half
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
}
} else {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
.addImm(NumWords)
.setMIFlags(MachineInstr::FrameSetup);
}
switch (MF.getTarget().getCodeModel()) {
case CodeModel::Tiny:
case CodeModel::Small:
case CodeModel::Medium:
case CodeModel::Kernel:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
.addExternalSymbol("__chkstk")
.addReg(AArch64::X15, RegState::Implicit)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
.setMIFlags(MachineInstr::FrameSetup);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
}
break;
case CodeModel::Large:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
.addReg(AArch64::X16, RegState::Define)
.addExternalSymbol("__chkstk")
.addExternalSymbol("__chkstk")
.setMIFlags(MachineInstr::FrameSetup);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
.addReg(AArch64::X16, RegState::Kill)
.addReg(AArch64::X15, RegState::Implicit | RegState::Define)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
.setMIFlags(MachineInstr::FrameSetup);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
}
break;
}
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
.addReg(AArch64::SP, RegState::Kill)
.addReg(AArch64::X15, RegState::Kill)
.addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
.setMIFlags(MachineInstr::FrameSetup);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(NumBytes)
.setMIFlag(MachineInstr::FrameSetup);
}
NumBytes = 0;
}
StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
// Process the SVE callee-saves to determine what space needs to be
// allocated.
- if (AFI->getSVECalleeSavedStackSize()) {
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
// Find callee save instructions in frame.
CalleeSavesBegin = MBBI;
assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
++MBBI;
CalleeSavesEnd = MBBI;
- int64_t OffsetToFirstCalleeSaveFromSP =
- MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
- StackOffset OffsetToCalleeSavesFromSP =
- StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
- AllocateBefore -= OffsetToCalleeSavesFromSP;
+ AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};
AllocateAfter = SVEStackSize - AllocateBefore;
}
// Allocate space for the callee saves (if any).
emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
-AllocateBefore, TII,
MachineInstr::FrameSetup);
// Finally allocate remaining SVE stack space.
emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-AllocateAfter, TII,
MachineInstr::FrameSetup);
// Allocate space for the rest of the frame.
if (NumBytes) {
// Alignment is required for the parent frame, not the funclet
const bool NeedsRealignment =
!IsFunclet && RegInfo->needsStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
if (NeedsRealignment) {
scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
assert(scratchSPReg != AArch64::NoRegister);
}
// If we're a leaf function, try using the red zone.
if (!canUseRedZone(MF))
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
assert(NrBitsToZero > 1);
assert(scratchSPReg != AArch64::SP);
// SUB X9, SP, NumBytes
// -- X9 is temporary register, so shouldn't contain any live data here,
// -- free to use. This is already produced by emitFrameOffset above.
// AND SP, X9, 0b11111...0000
// The logical immediates have a non-trivial encoding. The following
// formula computes the encoded immediate with all ones but
// NrBitsToZero zero bits as least significant bits.
uint32_t andMaskEncoded = (1 << 12) // = N
| ((64 - NrBitsToZero) << 6) // immr
| ((64 - NrBitsToZero - 1) << 0); // imms
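// For example (illustrative): realigning to 16 bytes means NrBitsToZero == 4,
// so andMaskEncoded == (1 << 12) | (60 << 6) | 59 == 0x1f3b, the logical
// immediate encoding of 0xfffffffffffffff0 (all ones with the low 4 bits
// cleared).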
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
.addReg(scratchSPReg, RegState::Kill)
.addImm(andMaskEncoded);
AFI->setStackRealigned(true);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(NumBytes & andMaskEncoded)
.setMIFlag(MachineInstr::FrameSetup);
}
}
}
// If we need a base pointer, set it up here. It's whatever the value of the
// stack pointer is at this point. Any variable size objects will be allocated
// after this, so we can still use the base pointer to reference locals.
//
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
// needed.
// For funclets the BP belongs to the containing function.
if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
false);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
}
}
// The very last FrameSetup instruction indicates the end of prologue. Emit a
// SEH opcode indicating the prologue end.
if (NeedsWinCFI && HasWinCFI) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
.setMIFlag(MachineInstr::FrameSetup);
}
// SEH funclets are passed the frame pointer in X1. If the parent
// function uses the base register, then the base register is used
// directly, and is not retrieved from X1.
if (IsFunclet && F.hasPersonalityFn()) {
EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
if (isAsynchronousEHPersonality(Per)) {
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
.addReg(AArch64::X1)
.setMIFlag(MachineInstr::FrameSetup);
MBB.addLiveIn(AArch64::X1);
}
}
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
const int StackGrowth = isTargetDarwin(MF)
? (2 * -TD.getPointerSize(0))
: -AFI->getCalleeSavedStackSize();
Register FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
// .globl __foo
// .align 2
// __foo:
// Ltmp0:
// .cfi_startproc
// .cfi_personality 155, ___gxx_personality_v0
// Leh_func_begin:
// .cfi_lsda 16, Lexception33
//
// stp xa,bx, [sp, -#offset]!
// ...
// stp x28, x27, [sp, #offset-32]
// stp fp, lr, [sp, #offset-16]
// add fp, sp, #offset - 16
// sub sp, sp, #1360
//
// The Stack:
// +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
// +===========================================+
// 10010 | X28 Register |
// 10014 | X28 Register |
// +-------------------------------------------+
// 10018 | X27 Register |
// 1001c | X27 Register |
// +===========================================+
// 10020 | Frame Pointer |
// 10024 | Frame Pointer |
// +-------------------------------------------+
// 10028 | Link Register |
// 1002c | Link Register |
// +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
//
// [sp] = 10030 :: >>initial value<<
// sp = 10020 :: stp fp, lr, [sp, #-16]!
// fp = sp == 10020 :: mov fp, sp
// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
// sp == 10010 :: >>final value<<
//
// The frame pointer (w29) points to address 10020. If we use an offset of
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
// for w27, and -32 for w28:
//
// Ltmp1:
// .cfi_def_cfa w29, 16
// Ltmp2:
// .cfi_offset w30, -8
// Ltmp3:
// .cfi_offset w29, -16
// Ltmp4:
// .cfi_offset w27, -24
// Ltmp5:
// .cfi_offset w28, -32
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
// Now emit the moves for whatever callee saved regs we have (including FP,
// LR if those are saved).
emitCalleeSavedFrameMoves(MBB, MBBI);
}
}
static void InsertReturnAddressAuth(MachineFunction &MF,
MachineBasicBlock &MBB) {
if (!ShouldSignReturnAddress(MF))
return;
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
// The AUTIASP instruction assembles to a hint instruction before v8.3a so
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
// instructions, namely RETA{A,B}, that can be used instead.
if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
BuildMI(MBB, MBBI, DL,
TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB))
.copyImplicitOps(*MBBI);
MBB.erase(MBBI);
} else {
BuildMI(
MBB, MBBI, DL,
TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP))
.setMIFlag(MachineInstr::FrameDestroy);
}
}
static bool isFuncletReturnInstr(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::CATCHRET:
case AArch64::CLEANUPRET:
return true;
}
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
auto WinCFI = make_scope_exit([&]() {
if (!MF.hasWinCFI())
MF.setHasWinCFI(HasWinCFI);
});
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
IsFunclet = isFuncletReturnInstr(*MBBI);
}
int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
: MFI.getStackSize();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
// The stack frame should be like below,
//
// ---------------------- ---
// | | |
// | BytesInStackArgArea| CalleeArgStackSize
// | (NumReusableBytes) | (of tail call)
// | | ---
// | | |
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
// | (CalleeSavedStackSize)| | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
// | LocalStackSize | | |
// | (covering callee | | |
// | args) | | |
// | | | |
// ---------------------- --- ---
//
// So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
// = StackSize + ArgumentPopSize
//
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
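// Illustrative numbers for the diagram above (hypothetical): with
// StackSize == 112, BytesInStackArgArea == 32 and a tail call needing 16
// bytes of stack arguments, ArgumentPopSize == 32 - 16 == 16 and the
// epilogue restores NumBytes == 112 + 16 == 128 bytes in total.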
auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// We cannot rely on the local stack size set in emitPrologue if the function
// has funclets, as funclets have different local stack size requirements, and
// the current value set in emitPrologue may be that of the containing
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (AArch64InstrInfo::isSEHInstruction(*Pop))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
// If the offset is 0, convert it to a post-index ldp.
if (OffsetOp.getImm() == 0)
convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
else {
// If not, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
AfterCSRPopSize += PrologueSaveSize;
}
}
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
--LastPopI;
if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
IsSVECalleeSave(LastPopI)) {
++LastPopI;
break;
} else if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
}
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
.setMIFlag(MachineInstr::FrameDestroy);
}
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
{NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
return;
}
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
// Process the SVE callee-saves to determine what space needs to be
// deallocated.
StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
- if (AFI->getSVECalleeSavedStackSize()) {
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
RestoreBegin = std::prev(RestoreEnd);
while (IsSVECalleeSave(RestoreBegin) &&
RestoreBegin != MBB.begin())
--RestoreBegin;
++RestoreBegin;
assert(IsSVECalleeSave(RestoreBegin) &&
IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
- int64_t OffsetToFirstCalleeSaveFromSP =
- MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
- StackOffset OffsetToCalleeSavesFromSP =
- StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
- DeallocateBefore = OffsetToCalleeSavesFromSP;
- DeallocateAfter = SVEStackSize - DeallocateBefore;
+ StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8};
+ DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
+ DeallocateAfter = CalleeSavedSizeAsOffset;
}
// Deallocate the SVE area.
if (SVEStackSize) {
if (AFI->isStackRealigned()) {
- if (AFI->getSVECalleeSavedStackSize())
- // Set SP to start of SVE area, from which the callee-save reloads
- // can be done. The code below will deallocate the stack space
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
+ // Set SP to start of SVE callee-save area from which they can
+ // be reloaded. The code below will deallocate the stack space
// by moving FP -> SP.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
- -SVEStackSize, TII, MachineInstr::FrameDestroy);
+ {-CalleeSavedSize, MVT::nxv1i8}, TII,
+ MachineInstr::FrameDestroy);
} else {
if (AFI->getSVECalleeSavedStackSize()) {
// Deallocate the non-SVE locals first before we can deallocate (and
// restore callee saves) from the SVE area.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
{NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
NumBytes = 0;
}
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
DeallocateBefore, TII, MachineInstr::FrameDestroy);
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
DeallocateAfter, TII, MachineInstr::FrameDestroy);
}
}
if (!hasFP(MF)) {
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
// stack pointer (but we may need to pop stack args for fastcc).
if (RedZone && AfterCSRPopSize == 0)
return;
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
// If we're done after this, make sure to help the load store optimizer.
if (Done)
adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
{StackRestoreBytes, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (Done) {
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
}
return;
}
NumBytes = 0;
}
// Restore the original stack pointer.
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
int64_t OffsetToFrameRecord =
isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
{OffsetToFrameRecord, MVT::i8},
TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
} else if (NumBytes)
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
{NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
NeedsWinCFI);
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
if (AfterCSRPopSize) {
// Find an insertion point for the first ldp so that it goes before the
// shadow call stack epilog instruction. This ensures that the restore of
// lr from x18 is placed after the restore from sp.
auto FirstSPPopI = MBB.getFirstTerminator();
while (FirstSPPopI != Begin) {
auto Prev = std::prev(FirstSPPopI);
if (Prev->getOpcode() != AArch64::LDRXpre ||
Prev->getOperand(0).getReg() == AArch64::SP)
break;
FirstSPPopI = Prev;
}
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
{(int64_t)AfterCSRPopSize, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
}
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
MF.setHasWinCFI(HasWinCFI);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
Register &FrameReg) const {
return resolveFrameIndexReference(
MF, FI, FrameReg,
/*PreferFP=*/
MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
/*ForSimm=*/false)
.getBytes();
}
int AArch64FrameLowering::getNonLocalFrameIndexReference(
const MachineFunction &MF, int FI) const {
return getSEHFrameIndexOffset(MF, FI);
}
static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject =
getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
unsigned FPAdjust = isTargetDarwin(MF)
? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
}
static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8};
}
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
int FI) const {
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
? getFPOffset(MF, ObjectOffset).getBytes()
: getStackOffset(MF, ObjectOffset).getBytes();
}
StackOffset AArch64FrameLowering::resolveFrameIndexReference(
const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
PreferFP, ForSimm);
}
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
Register &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
// reliable as a base). Make sure useFPForScavengingIndex() does the
// right thing for the emergency spill slot.
bool UseFP = false;
if (AFI->hasStackFrame() && !isSVE) {
// We shouldn't prefer using the FP when there is an SVE area
// in between the FP and the non-SVE locals/spills.
PreferFP &= !SVEStackSize;
// Note: Keeping the following as multiple 'if' statements rather than
// merging to a single expression for readability.
//
// Argument access should always use the FP.
if (isFixed) {
UseFP = hasFP(MF);
} else if (isCSR && RegInfo->needsStackRealignment(MF)) {
// References to the CSR area must use FP if we're re-aligning the stack
// since the dynamically-sized alignment padding is between the SP/BP and
// the CSR area.
assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
UseFP = true;
} else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
// If the FPOffset is negative and we're producing a signed immediate, we
// have to keep in mind that the available offset range for negative
// offsets is smaller than for positive ones. If an offset is available
// via the FP and the SP, use whichever is closest.
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
PreferFP |= Offset > -FPOffset;
if (MFI.hasVarSizedObjects()) {
// If we have variable sized objects, we can use either FP or BP, as the
// SP offset is unknown. We can use the base pointer if we have one and
// FP is not preferred. If not, we're stuck with using FP.
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
else if (!CanUseBP) // Can't use BP. Forced to use FP.
UseFP = true;
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
} else if (FPOffset >= 0) {
// Use SP or FP, whichever gives us the best chance of the offset
// being in range for direct access. If the FPOffset is positive,
// that'll always be best, as the SP will be even further away.
UseFP = true;
} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
// Funclets access the locals contained in the parent's stack frame
// via the frame pointer, so we have to use the FP in the parent
// function.
(void) Subtarget;
assert(
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
"Funclets should only be present on Win64");
UseFP = true;
} else {
// We have the choice between FP and (SP or BP).
if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
UseFP = true;
}
}
}
assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
"In the presence of dynamic stack pointer realignment, "
"non-argument/CSR objects cannot be accessed through the frame pointer");
if (isSVE) {
int64_t OffsetToSVEArea =
MFI.getStackSize() - AFI->getCalleeSavedStackSize();
StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
StackOffset SPOffset = SVEStackSize +
StackOffset(ObjectOffset, MVT::nxv1i8) +
StackOffset(OffsetToSVEArea, MVT::i8);
// Always use the FP for SVE spills if available and beneficial.
if (hasFP(MF) &&
(SPOffset.getBytes() ||
FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
RegInfo->needsStackRealignment(MF))) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
}
FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
: (unsigned)AArch64::SP;
return SPOffset;
}
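// Fixed and CSR objects live above the SVE area while ordinary locals live
// below it, so an FP-relative access to a local must subtract the scalable
// size and an SP/BP-relative access to a fixed/CSR object must add it.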
StackOffset ScalableOffset = {};
if (UseFP && !(isFixed || isCSR))
ScalableOffset = -SVEStackSize;
if (!UseFP && (isFixed || isCSR))
ScalableOffset = SVEStackSize;
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
}
// Use the base pointer if we have one.
if (RegInfo->hasBasePointer(MF))
FrameReg = RegInfo->getBaseRegister();
else {
assert(!MFI.hasVarSizedObjects() &&
"Can't use SP when we have var sized objects.");
FrameReg = AArch64::SP;
// If we're using the red zone for this function, the SP won't actually
// be adjusted, so the offsets will be negative. They're also all
// within range of the signed 9-bit immediate instructions.
if (canUseRedZone(MF))
Offset -= AFI->getLocalStackSize();
}
return StackOffset(Offset, MVT::i8) + ScalableOffset;
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
// Do not set a kill flag on values that are also marked as live-in. This
// happens with the @llvm.returnaddress intrinsic and with arguments passed in
// callee saved registers.
// Omitting the kill flags is conservatively correct even if the live-in
// is not used after all.
bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
return getKillRegState(!IsLiveIn);
}
static bool produceCompactUnwindFrame(MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AttributeList Attrs = MF.getFunction().getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
bool NeedsWinCFI) {
// If we are generating register pairs for a Windows function that requires
// EH support, then pair consecutive registers only. There are no unwind
// opcodes for saves/restores of non-consecutive register pairs.
// The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x.
// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
// TODO: LR can be paired with any register. We don't support this yet in
// the MCLayer. We need to add support for the save_lrpair unwind code.
if (Reg2 == AArch64::FP)
return true;
if (!NeedsWinCFI)
return false;
if (Reg2 == Reg1 + 1)
return false;
return true;
}
/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
/// WindowsCFI requires that only consecutive registers can be paired.
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) {
if (UsesWinAAPCS)
return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI);
// If we need to store the frame record, don't pair any register
// with LR other than FP.
if (NeedsFrameRecord)
return Reg2 == AArch64::LR;
return false;
}
namespace {
struct RegPairInfo {
unsigned Reg1 = AArch64::NoRegister;
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
RegPairInfo() = default;
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
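// getScale() returns the per-register spill-slot size in bytes (2 for
// predicate registers, 8 for GPR/FPR64, 16 for ZPR/FPR128); it is also the
// unit in which the LDP/STP/LDR/STR immediates computed later are expressed.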
unsigned getScale() const {
switch (Type) {
case PPR:
return 2;
case GPR:
case FPR64:
return 8;
case ZPR:
case FPR128:
return 16;
}
llvm_unreachable("Unsupported type");
}
bool isScalable() const { return Type == PPR || Type == ZPR; }
};
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
if (CSI.empty())
return;
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
unsigned Count = CSI.size();
(void)CC;
// MachO's compact unwind format relies on all registers being stored in
// pairs.
assert((!produceCompactUnwindFrame(MF) ||
CC == CallingConv::PreserveMost ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
int ByteOffset = AFI->getCalleeSavedStackSize();
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
// On Linux, we will have either one or zero non-paired registers. On Windows
// with CFI, we can have multiple unpaired registers in order to utilize the
// available unwind codes. This flag ensures that the alignment fixup is done
// only once, as intended.
bool FixupDone = false;
for (unsigned i = 0; i < Count; ++i) {
RegPairInfo RPI;
RPI.Reg1 = CSI[i].getReg();
if (AArch64::GPR64RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::GPR;
else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR64;
else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR128;
else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
else
llvm_unreachable("Unsupported register class.");
// Add the next reg to the pair if it is in the same register class.
if (i + 1 < Count) {
unsigned NextReg = CSI[i + 1].getReg();
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
!invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
NeedsFrameRecord))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR64:
if (AArch64::FPR64RegClass.contains(NextReg) &&
!invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR128:
if (AArch64::FPR128RegClass.contains(NextReg))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::PPR:
case RegPairInfo::ZPR:
break;
}
}
// If either of the registers to be saved is the lr register, it means that
// we also need to save lr in the shadow call stack.
if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
report_fatal_error("Must reserve x18 to use shadow call stack");
NeedShadowCallStackProlog = true;
}
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
//
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
assert((!RPI.isPaired() ||
(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
RPI.Reg1 == AArch64::LR) &&
"FrameRecord must be allocated together with LR");
// Windows AAPCS has FP and LR reversed.
assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
RPI.Reg2 == AArch64::LR) &&
"FrameRecord must be allocated together with LR");
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
CC == CallingConv::PreserveMost ||
(RPI.isPaired() &&
((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
RPI.Reg1 + 1 == RPI.Reg2))) &&
"Callee-save registers not saved as adjacent register pair!");
RPI.FrameIdx = CSI[i].getFrameIdx();
int Scale = RPI.getScale();
if (RPI.isScalable())
ScalableByteOffset -= Scale;
else
ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
assert(!(RPI.isScalable() && RPI.isPaired()) &&
"Paired spill/fill instructions don't exist for SVE vectors");
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
!RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
!RPI.isPaired()) {
FixupDone = true;
ByteOffset -= 8;
assert(ByteOffset % 16 == 0);
assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
}
int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(Offset % Scale == 0);
RPI.Offset = Offset / Scale;
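// Worked example: with a 32-byte callee-save area the first GPR pair is
// placed at ByteOffset 16; Scale is 8, so RPI.Offset becomes 2, which the
// spill code emits as "stp Xn, Xm, [sp, #16]" (the immediate is in
// register-sized units) and which fits the signed 7-bit LDP/STP range
// checked below.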
assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
RegPairs.push_back(RPI);
if (RPI.isPaired())
++i;
}
}
bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
NeedShadowCallStackProlog, hasFP(MF));
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (NeedShadowCallStackProlog) {
// Shadow call stack prolog: str x30, [x18], #8
BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
.addReg(AArch64::X18, RegState::Define)
.addReg(AArch64::LR)
.addReg(AArch64::X18)
.addImm(8)
.setMIFlag(MachineInstr::FrameSetup);
if (NeedsWinCFI)
BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
// Emit a CFI instruction that causes 8 to be subtracted from the value of
// x18 when unwinding past this frame.
static const char CFIInst[] = {
dwarf::DW_CFA_val_expression,
18, // register
2, // length
static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
static_cast<char>(-8) & 0x7f, // addend (sleb128)
};
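// For illustration: DW_CFA_val_expression says "the value of register 18 is
// given by the following 2-byte DWARF expression", and that expression is
// DW_OP_breg18 with an SLEB128 addend of -8 (the single byte 0x78), so the
// unwinder recomputes x18 as its current value minus 8.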
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
nullptr, StringRef(CFIInst, sizeof(CFIInst))));
BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
// This instruction also makes x18 live-in to the entry block.
MBB.addLiveIn(AArch64::X18);
}
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
// Issue sequence of spills for cs regs. The first spill may be converted
// to a pre-decrement store later by emitPrologue if the callee-save stack
// area allocation can't be combined with the local stack area allocation.
// For example:
// stp x22, x21, [sp, #0] // addImm(+0)
// stp x20, x19, [sp, #16] // addImm(+2)
// stp fp, lr, [sp, #32] // addImm(+4)
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
// Note: Similar rationale and sequence for restores in epilog.
unsigned Size;
Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
Size = 8;
Alignment = Align(8);
break;
case RegPairInfo::FPR64:
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
Size = 8;
Alignment = Align(8);
break;
case RegPairInfo::FPR128:
StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
Size = 16;
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
StrOpc = AArch64::STR_ZXI;
Size = 16;
Alignment = Align(16);
break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
dbgs() << ")\n");
assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
"Windows unwinding requires a consecutive (FP,LR) pair");
// Windows unwind codes require consecutive registers if registers are
// paired. Make the switch here, so that the code below will save (x,x+1)
// and not (x+1,x).
unsigned FrameIdxReg1 = RPI.FrameIdx;
unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
if (NeedsWinCFI && RPI.isPaired()) {
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
}
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale],
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOStore, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameSetup);
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
}
return true;
}
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedsWinCFI = needsWinCFI(MF);
if (MI != MBB.end())
DL = MI->getDebugLoc();
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
NeedShadowCallStackProlog, hasFP(MF));
auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
// Issue sequence of restores for cs regs. The last restore may be converted
// to a post-increment load later by emitEpilogue if the callee-save stack
// area allocation can't be combined with the local stack area allocation.
// For example:
// ldp fp, lr, [sp, #32] // addImm(+4)
// ldp x20, x19, [sp, #16] // addImm(+2)
// ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
unsigned Size;
Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
Size = 8;
Alignment = Align(8);
break;
case RegPairInfo::FPR64:
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
Size = 8;
Alignment = Align(8);
break;
case RegPairInfo::FPR128:
LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
Size = 16;
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
LdrOpc = AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
case RegPairInfo::PPR:
LdrOpc = AArch64::LDR_PXI;
Size = 2;
Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
dbgs() << ")\n");
// Windows unwind codes require consecutive registers if registers are
// paired. Make the switch here, so that the code below will restore (x,x+1)
// and not (x+1,x).
unsigned FrameIdxReg1 = RPI.FrameIdx;
unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
if (NeedsWinCFI && RPI.isPaired()) {
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
MIB.addReg(Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
}
MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale]
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
};
// SVE objects are always restored in reverse order.
for (const RegPairInfo &RPI : reverse(RegPairs))
if (RPI.isScalable())
EmitMI(RPI);
if (ReverseCSRRestoreSeq) {
for (const RegPairInfo &RPI : reverse(RegPairs))
if (!RPI.isScalable())
EmitMI(RPI);
} else
for (const RegPairInfo &RPI : RegPairs)
if (!RPI.isScalable())
EmitMI(RPI);
if (NeedShadowCallStackProlog) {
// Shadow call stack epilog: ldr x30, [x18, #-8]!
BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
.addReg(AArch64::X18, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::X18)
.addImm(-8)
.setMIFlag(MachineInstr::FrameDestroy);
}
return true;
}
void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
MachineFrameInfo &MFI = MF.getFrameInfo();
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
? RegInfo->getBaseRegister()
: (unsigned)AArch64::NoRegister;
unsigned ExtraCSSpill = 0;
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
const unsigned Reg = CSRegs[i];
// Add the base pointer register to SavedRegs if it is callee-save.
if (Reg == BasePointerReg)
SavedRegs.set(Reg);
bool RegUsed = SavedRegs.test(Reg);
unsigned PairedReg = AArch64::NoRegister;
if (AArch64::GPR64RegClass.contains(Reg) ||
AArch64::FPR64RegClass.contains(Reg) ||
AArch64::FPR128RegClass.contains(Reg))
PairedReg = CSRegs[i ^ 1];
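// CSRegs is laid out in pairs, so flipping the low bit of the index (i ^ 1)
// selects the partner slot: 0<->1, 2<->3, and so on.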
if (!RegUsed) {
if (AArch64::GPR64RegClass.contains(Reg) &&
!RegInfo->isReservedReg(MF, Reg)) {
UnspilledCSGPR = Reg;
UnspilledCSGPRPaired = PairedReg;
}
continue;
}
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
!SavedRegs.test(PairedReg)) {
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
!RegInfo->isReservedReg(MF, PairedReg))
ExtraCSSpill = PairedReg;
}
}
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
!Subtarget.isTargetWindows()) {
// For Windows calling convention on a non-windows OS, where X18 is treated
// as reserved, back up X18 when entering non-windows code (marked with the
// Windows calling convention) and restore when returning regardless of
// whether the individual function uses it - it might call other functions
// that clobber it.
SavedRegs.set(AArch64::X18);
}
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
unsigned SVECSStackSize = 0;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned Reg : SavedRegs.set_bits()) {
auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
if (AArch64::PPRRegClass.contains(Reg) ||
AArch64::ZPRRegClass.contains(Reg))
SVECSStackSize += RegSize;
else
CSStackSize += RegSize;
}
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
// The frame record needs to be created by saving the appropriate registers
uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
if (hasFP(MF) ||
windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
SavedRegs.set(AArch64::FP);
SavedRegs.set(AArch64::LR);
}
LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
for (unsigned Reg
: SavedRegs.set_bits()) dbgs()
<< ' ' << printReg(Reg, RegInfo);
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
int64_t SVEStackSize =
alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
// Conservatively always assume BigStack when there are SVE spills.
bool BigStack = SVEStackSize ||
(EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
// Estimate if we might need to scavenge a register at some point in order
// to materialize a stack offset. If so, either spill one additional
// callee-saved register or reserve a special spill slot to facilitate
// register scavenging. If we already spilled an extra callee-saved register
// above to keep the number of spills even, we don't need to do anything else
// here.
if (BigStack) {
if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
<< " to get a scratch register.\n");
SavedRegs.set(UnspilledCSGPR);
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
// store the pair.
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = UnspilledCSGPR;
}
// If we didn't find an extra callee-saved register to spill, create
// an emergency spill slot.
if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass &RC = AArch64::GPR64RegClass;
unsigned Size = TRI->getSpillSize(RC);
Align Alignment = TRI->getSpillAlign(RC);
int FI = MFI.CreateStackObject(Size, Alignment, false);
RS->addScavengingFrameIndex(FI);
LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
}
}
// Adding the size of additional 64bit GPR saves.
CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
<< EstimatedStackSize + AlignedCSStackSize
<< " bytes.\n");
assert((!MFI.isCalleeSavedInfoValid() ||
AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
"Should not invalidate callee saved info");
// Round up to register pair alignment to avoid additional SP adjustment
// instructions.
AFI->setCalleeSavedStackSize(AlignedCSStackSize);
AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
}
bool AArch64FrameLowering::enableStackSlotScavenging(
const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
return AFI->hasCalleeSaveStackFreeSpace();
}
/// Returns true if there are any SVE callee saves.
static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
int &Min, int &Max) {
Min = std::numeric_limits<int>::max();
Max = std::numeric_limits<int>::min();
if (!MFI.isCalleeSavedInfoValid())
return false;
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
for (auto &CS : CSI) {
if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
AArch64::PPRRegClass.contains(CS.getReg())) {
assert((Max == std::numeric_limits<int>::min() ||
Max + 1 == CS.getFrameIdx()) &&
"SVE CalleeSaves are not consecutive");
Min = std::min(Min, CS.getFrameIdx());
Max = std::max(Max, CS.getFrameIdx());
}
}
return Min != std::numeric_limits<int>::max();
}
// Process all the SVE stack objects and determine offsets for each
// object. If AssignOffsets is true, the offsets get assigned.
// Fills in the first and last callee-saved frame indices into
// Min/MaxCSFrameIndex, respectively.
// Returns the size of the stack.
static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
int &MinCSFrameIndex,
int &MaxCSFrameIndex,
bool AssignOffsets) {
+#ifndef NDEBUG
// First process all fixed stack objects.
- int64_t Offset = 0;
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
- if (MFI.getStackID(I) == TargetStackID::SVEVector) {
- int64_t FixedOffset = -MFI.getObjectOffset(I);
- if (FixedOffset > Offset)
- Offset = FixedOffset;
- }
+ assert(MFI.getStackID(I) != TargetStackID::SVEVector &&
+ "SVE vectors should never be passed on the stack by value, only by "
+ "reference.");
+#endif
auto Assign = [&MFI](int FI, int64_t Offset) {
LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
MFI.setObjectOffset(FI, Offset);
};
+ int64_t Offset = 0;
+
// Then process all callee saved slots.
if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
- // Make sure to align the last callee save slot.
- MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));
-
// Assign offsets to the callee save slots.
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
Offset += MFI.getObjectSize(I);
Offset = alignTo(Offset, MFI.getObjectAlign(I));
if (AssignOffsets)
Assign(I, -Offset);
}
}
+ // Ensure that the callee-save area is aligned to 16 bytes.
+ Offset = alignTo(Offset, Align(16U));
+
// Create a buffer of SVE objects to allocate and sort it.
SmallVector<int, 8> ObjectsToAllocate;
for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
unsigned StackID = MFI.getStackID(I);
if (StackID != TargetStackID::SVEVector)
continue;
if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
continue;
if (MFI.isDeadObjectIndex(I))
continue;
ObjectsToAllocate.push_back(I);
}
// Allocate all SVE locals and spills
for (unsigned FI : ObjectsToAllocate) {
Align Alignment = MFI.getObjectAlign(FI);
// FIXME: Given that the length of SVE vectors is not necessarily a power of
// two, we'd need to align every object dynamically at runtime if the
// alignment is larger than 16. This is not yet supported.
if (Alignment > Align(16))
report_fatal_error(
"Alignment of scalable vectors > 16 bytes is not yet supported");
Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
if (AssignOffsets)
Assign(FI, -Offset);
}
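// At this point Offset is the running size of the SVE area (callee saves
// plus locals) in scalable bytes; each object above was assigned a negative
// offset measured down from the top of that area.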
return Offset;
}
int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
MachineFrameInfo &MFI) const {
int MinCSFrameIndex, MaxCSFrameIndex;
return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
}
int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
true);
}
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
"Upwards growing stack unsupported");
int MinCSFrameIndex, MaxCSFrameIndex;
int64_t SVEStackSize =
assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
if (!MF.hasEHFunclets())
return;
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
MachineBasicBlock &MBB = MF.front();
auto MBBI = MBB.begin();
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
++MBBI;
// Create an UnwindHelp object.
// The UnwindHelp object is allocated at the start of the fixed object area
int64_t FixedObject =
getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
/*SPOffset*/ -FixedObject,
/*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// We need to store -2 into the UnwindHelp object at the start of the
// function.
DebugLoc DL;
RS->enterBasicBlockEnd(MBB);
RS->backward(std::prev(MBBI));
unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
assert(DstReg && "There must be a free register after frame setup");
BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
.addReg(DstReg, getKillRegState(true))
.addFrameIndex(UnwindHelpFI)
.addImm(0);
}
namespace {
struct TagStoreInstr {
MachineInstr *MI;
int64_t Offset, Size;
explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
: MI(MI), Offset(Offset), Size(Size) {}
};
class TagStoreEdit {
MachineFunction *MF;
MachineBasicBlock *MBB;
MachineRegisterInfo *MRI;
// Tag store instructions that are being replaced.
SmallVector<TagStoreInstr, 8> TagStores;
// Combined memref arguments of the above instructions.
SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
// Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
// FrameRegOffset + Size) with the address tag of SP.
Register FrameReg;
StackOffset FrameRegOffset;
int64_t Size;
// If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
Optional<int64_t> FrameRegUpdate;
// MIFlags for any FrameReg updating instructions.
unsigned FrameRegUpdateFlags;
// Use zeroing instruction variants.
bool ZeroData;
DebugLoc DL;
void emitUnrolled(MachineBasicBlock::iterator InsertI);
void emitLoop(MachineBasicBlock::iterator InsertI);
public:
TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
: MBB(MBB), ZeroData(ZeroData) {
MF = MBB->getParent();
MRI = &MF->getRegInfo();
}
// Add an instruction to be replaced. Instructions must be added in the
// ascending order of Offset, and have to be adjacent.
void addInstruction(TagStoreInstr I) {
assert((TagStores.empty() ||
TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
"Non-adjacent tag store instructions.");
TagStores.push_back(I);
}
void clear() { TagStores.clear(); }
// Emit equivalent code at the given location, and erase the current set of
// instructions. May skip if the replacement is not profitable. May invalidate
// the input iterator and replace it with a valid one.
void emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast);
};
void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
const int64_t kMinOffset = -256 * 16;
const int64_t kMaxOffset = 255 * 16;
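// STG/ST2G encode a signed 9-bit immediate scaled by 16, hence the
// [-256*16, 255*16] byte window around BaseReg; offsets outside it are
// rebased onto a scratch register below.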
Register BaseReg = FrameReg;
int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
if (BaseRegOffsetBytes < kMinOffset ||
BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
{BaseRegOffsetBytes, MVT::i8}, TII);
BaseReg = ScratchReg;
BaseRegOffsetBytes = 0;
}
MachineInstr *LastI = nullptr;
while (Size) {
int64_t InstrSize = (Size > 16) ? 32 : 16;
unsigned Opcode =
InstrSize == 16
? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
: (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
.addReg(AArch64::SP)
.addReg(BaseReg)
.addImm(BaseRegOffsetBytes / 16)
.setMemRefs(CombinedMemRefs);
// A store to [BaseReg, #0] should go last for an opportunity to fold the
// final SP adjustment in the epilogue.
if (BaseRegOffsetBytes == 0)
LastI = I;
BaseRegOffsetBytes += InstrSize;
Size -= InstrSize;
}
if (LastI)
MBB->splice(InsertI, MBB, LastI);
}
void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
Register BaseReg = FrameRegUpdate
? FrameReg
: MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
int64_t LoopSize = Size;
// If the loop size is not a multiple of 32, split off one 16-byte store at
// the end to fold BaseReg update into.
if (FrameRegUpdate && *FrameRegUpdate)
LoopSize -= LoopSize % 32;
MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGloop_wback
: AArch64::STGloop_wback))
.addDef(SizeReg)
.addDef(BaseReg)
.addImm(LoopSize)
.addReg(BaseReg)
.setMemRefs(CombinedMemRefs);
if (FrameRegUpdate)
LoopI->setFlags(FrameRegUpdateFlags);
int64_t ExtraBaseRegUpdate =
FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
if (LoopSize < Size) {
assert(FrameRegUpdate);
assert(Size - LoopSize == 16);
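// The STGPostIndex write-back immediate is in 16-byte granules: "1" covers
// the 16 bytes tagged here and ExtraBaseRegUpdate/16 folds in the remaining
// base-register adjustment from the merged ADD/SUB, if any.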
// Tag 16 more bytes at BaseReg and update BaseReg.
BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
.addDef(BaseReg)
.addReg(BaseReg)
.addReg(BaseReg)
.addImm(1 + ExtraBaseRegUpdate / 16)
.setMemRefs(CombinedMemRefs)
.setMIFlags(FrameRegUpdateFlags);
} else if (ExtraBaseRegUpdate) {
// Update BaseReg.
BuildMI(
*MBB, InsertI, DL,
TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
.addDef(BaseReg)
.addReg(BaseReg)
.addImm(std::abs(ExtraBaseRegUpdate))
.addImm(0)
.setMIFlags(FrameRegUpdateFlags);
}
}
// Check if *II is a register update that can be merged into the STGloop that
// ends at (Reg + Size). On success, *TotalOffset is set to the offset applied
// by that update instruction.
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
int64_t Size, int64_t *TotalOffset) {
MachineInstr &MI = *II;
if ((MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::SUBXri) &&
MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
int64_t Offset = MI.getOperand(2).getImm() << Shift;
if (MI.getOpcode() == AArch64::SUBXri)
Offset = -Offset;
int64_t AbsPostOffset = std::abs(Offset - Size);
const int64_t kMaxOffset =
0xFFF; // Max encoding for unshifted ADDXri / SUBXri
if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
*TotalOffset = Offset;
return true;
}
}
return false;
}
void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
SmallVectorImpl<MachineMemOperand *> &MemRefs) {
MemRefs.clear();
for (auto &TS : TSE) {
MachineInstr *MI = TS.MI;
// An instruction without memory operands may access anything. Be
// conservative and return an empty list.
if (MI->memoperands_empty()) {
MemRefs.clear();
return;
}
MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
}
}
void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast) {
if (TagStores.empty())
return;
TagStoreInstr &FirstTagStore = TagStores[0];
TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
DL = TagStores[0].MI->getDebugLoc();
Register Reg;
FrameRegOffset = TFI->resolveFrameOffsetReference(
*MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
/*PreferFP=*/false, /*ForSimm=*/true);
FrameReg = Reg;
FrameRegUpdate = None;
mergeMemRefs(TagStores, CombinedMemRefs);
LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
for (const auto &Instr
: TagStores) { dbgs() << " " << *Instr.MI; });
// Size threshold where a loop becomes shorter than a linear sequence of
// tagging instructions.
const int kSetTagLoopThreshold = 176;
if (Size < kSetTagLoopThreshold) {
if (TagStores.size() < 2)
return;
emitUnrolled(InsertI);
} else {
MachineInstr *UpdateInstr = nullptr;
int64_t TotalOffset;
if (IsLast) {
// See if we can merge base register update into the STGloop.
// This is done in AArch64LoadStoreOptimizer for "normal" stores,
// but STGloop is way too unusual for that, and also it only
// realistically happens in function epilogue. Also, STGloop is expanded
// before that pass.
if (InsertI != MBB->end() &&
canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
&TotalOffset)) {
UpdateInstr = &*InsertI++;
LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
<< *UpdateInstr);
}
}
if (!UpdateInstr && TagStores.size() < 2)
return;
if (UpdateInstr) {
FrameRegUpdate = TotalOffset;
FrameRegUpdateFlags = UpdateInstr->getFlags();
}
emitLoop(InsertI);
if (UpdateInstr)
UpdateInstr->eraseFromParent();
}
for (auto &TS : TagStores)
TS.MI->eraseFromParent();
}
bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
int64_t &Size, bool &ZeroData) {
MachineFunction &MF = *MI.getParent()->getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Opcode = MI.getOpcode();
ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
Opcode == AArch64::STZ2GOffset);
if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
return false;
if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
return false;
Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
Size = MI.getOperand(2).getImm();
return true;
}
if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
Size = 16;
else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
Size = 32;
else
return false;
if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
return false;
Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
16 * MI.getOperand(2).getImm();
return true;
}
// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
const AArch64FrameLowering *TFI,
RegScavenger *RS) {
bool FirstZeroData;
int64_t Size, Offset;
MachineInstr &MI = *II;
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator NextI = ++II;
if (&MI == &MBB->instr_back())
return II;
if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
return II;
SmallVector<TagStoreInstr, 4> Instrs;
Instrs.emplace_back(&MI, Offset, Size);
constexpr int kScanLimit = 10;
int Count = 0;
for (MachineBasicBlock::iterator E = MBB->end();
NextI != E && Count < kScanLimit; ++NextI) {
MachineInstr &MI = *NextI;
bool ZeroData;
int64_t Size, Offset;
// Collect instructions that update memory tags with a FrameIndex operand
// and (when applicable) constant size, and whose output registers are dead
// (the latter is almost always the case in practice). Since these
// instructions effectively have no inputs or outputs, we are free to skip
// any non-aliasing instructions in between without tracking used registers.
if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
if (ZeroData != FirstZeroData)
break;
Instrs.emplace_back(&MI, Offset, Size);
continue;
}
// Only count non-transient, non-tagging instructions toward the scan
// limit.
if (!MI.isTransient())
++Count;
// Just in case, stop before the epilogue code starts.
if (MI.getFlag(MachineInstr::FrameSetup) ||
MI.getFlag(MachineInstr::FrameDestroy))
break;
// Reject anything that may alias the collected instructions.
if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
break;
}
// New code will be inserted after the last tagging instruction we've found.
MachineBasicBlock::iterator InsertI = Instrs.back().MI;
InsertI++;
llvm::stable_sort(Instrs,
[](const TagStoreInstr &Left, const TagStoreInstr &Right) {
return Left.Offset < Right.Offset;
});
// Make sure that we don't have any overlapping stores.
int64_t CurOffset = Instrs[0].Offset;
for (auto &Instr : Instrs) {
if (CurOffset > Instr.Offset)
return NextI;
CurOffset = Instr.Offset + Instr.Size;
}
// Find contiguous runs of tagged memory and emit shorter instruction
// sequences for them when possible.
TagStoreEdit TSE(MBB, FirstZeroData);
Optional<int64_t> EndOffset;
for (auto &Instr : Instrs) {
if (EndOffset && *EndOffset != Instr.Offset) {
// Found a gap.
TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
TSE.clear();
}
TSE.addInstruction(Instr);
EndOffset = Instr.Offset + Instr.Size;
}
TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
return InsertI;
}
} // namespace
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
if (StackTaggingMergeSetTag)
for (auto &BB : MF)
for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
II = tryMergeAdjacentSTG(II, this, RS);
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, Register &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (IgnoreSPUpdates) {
LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
<< MFI.getObjectOffset(FI) << "\n");
FrameReg = AArch64::SP;
return MFI.getObjectOffset(FI);
}
return getFrameIndexReference(MF, FI, FrameReg);
}
/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
const MachineFunction &MF) const {
return 0;
}
/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
const MachineFunction &MF) const {
// This is the size of the pushed CSRs.
unsigned CSSize =
MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
// This is the amount of stack a funclet needs to allocate.
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
getStackAlign());
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 9d0a6d9eaf25..444740cb50ab 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,122 +1,128 @@
//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#include "AArch64StackOffset.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class AArch64FrameLowering : public TargetFrameLowering {
public:
explicit AArch64FrameLowering()
: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
void
emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const override;
StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg, bool PreferFP,
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
int64_t ObjectOffset, bool isFixed,
bool isSVE, Register &FrameReg,
bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
/// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
/// Returns true if the target will correctly handle shrink wrapping.
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
bool enableStackSlotScavenging(const MachineFunction &MF) const override;
TargetStackID::Value getStackIDForScalableVectors() const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
Register &FrameReg,
bool IgnoreSPUpdates) const override;
int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const override;
int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
bool isSupportedStackID(TargetStackID::Value ID) const override {
switch (ID) {
default:
return false;
case TargetStackID::Default:
case TargetStackID::SVEVector:
case TargetStackID::NoAlloc:
return true;
}
}
+ bool isStackIdSafeForLocalArea(unsigned StackId) const override {
+ // We don't support putting SVE objects into the pre-allocated local
+ // frame block at the moment.
+ return StackId != TargetStackID::SVEVector;
+ }
+
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
uint64_t StackBumpBytes) const;
int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
unsigned StackBumpBytes) const;
};
} // End llvm namespace
#endif
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 10c477853353..7799ebfbd68e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1,4931 +1,4940 @@
//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the AArch64 target.
//
//===----------------------------------------------------------------------===//
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-isel"
//===--------------------------------------------------------------------===//
/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
/// instructions for SelectionDAG operations.
///
namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
StringRef getPassName() const override {
return "AArch64 Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
void Select(SDNode *Node) override;
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
template <signed Low, signed High, signed Scale>
bool SelectRDVLImm(SDValue N, SDValue &Imm);
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, false, Reg, Shift);
}
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
}
bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
}
bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
}
bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
}
bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
}
bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
}
bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
}
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 2, Base, OffImm);
}
bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 4, Base, OffImm);
}
bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 8, Base, OffImm);
}
bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 16, Base, OffImm);
}
bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 1, Base, OffImm);
}
bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 2, Base, OffImm);
}
bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 4, Base, OffImm);
}
bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 8, Base, OffImm);
}
bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 16, Base, OffImm);
}
template<int Width>
bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &SignExtend, SDValue &DoShift) {
return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
}
template<int Width>
bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &SignExtend, SDValue &DoShift) {
return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
}
bool SelectDupZeroOrUndef(SDValue N) {
switch(N->getOpcode()) {
case ISD::UNDEF:
return true;
case AArch64ISD::DUP:
case ISD::SPLAT_VECTOR: {
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
if (CN->isNullValue())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
return true;
break;
}
default:
break;
}
return false;
}
bool SelectDupZero(SDValue N) {
switch(N->getOpcode()) {
case AArch64ISD::DUP:
case ISD::SPLAT_VECTOR: {
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
if (CN->isNullValue())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
return true;
break;
}
}
return false;
}
template<MVT::SimpleValueType VT>
bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
return SelectSVEAddSubImm(N, VT, Imm, Shift);
}
template<MVT::SimpleValueType VT>
bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
return SelectSVELogicalImm(N, VT, Imm);
}
template <unsigned Low, unsigned High>
bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
return SelectSVEShiftImm64(N, Low, High, Imm);
}
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
template<signed Min, signed Max, signed Scale, bool Shift>
bool SelectCntImm(SDValue N, SDValue &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
if (Shift)
MulImm = 1LL << MulImm;
if ((MulImm % std::abs(Scale)) != 0)
return false;
MulImm /= Scale;
if ((MulImm >= Min) && (MulImm <= Max)) {
Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
return true;
}
return false;
}
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element, that element is
/// returned unchanged; otherwise a REG_SEQUENCE value is returned.
SDValue createDTuple(ArrayRef<SDValue> Vecs);
SDValue createQTuple(ArrayRef<SDValue> Vecs);
// Form a sequence of SVE registers for instructions using list of vectors,
// e.g. structured loads and stores (ldN, stN).
SDValue createZTuple(ArrayRef<SDValue> Vecs);
/// Generic helper for the createDTuple/createQTuple
/// functions. Those should almost always be called instead.
SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
const unsigned SubRegs[]);
void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
bool tryIndexedLoad(SDNode *N);
bool trySelectStackSlotTagP(SDNode *N);
void SelectTagP(SDNode *N);
void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
+ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
+ unsigned Opc_rr, unsigned Opc_ri);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
template <int64_t Min, int64_t Max>
bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
SDValue &OffImm);
/// SVE Reg+Reg address mode.
template <unsigned Scale>
bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- template <unsigned Scale>
- void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr,
- const unsigned Opc_ri);
- template <unsigned Scale>
+ void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale,
+ unsigned Opc_rr, unsigned Opc_ri);
std::tuple<unsigned, SDValue, SDValue>
- findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
- const unsigned Opc_ri, const SDValue &OldBase,
- const SDValue &OldOffset);
+ findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
+ const SDValue &OldBase, const SDValue &OldOffset,
+ unsigned Scale);
bool tryBitfieldExtractOp(SDNode *N);
bool tryBitfieldExtractOpFromSExt(SDNode *N);
bool tryBitfieldInsertOp(SDNode *N);
bool tryBitfieldInsertInZeroOp(SDNode *N);
bool tryShiftAmountMod(SDNode *N);
bool tryHighFPExt(SDNode *N);
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AArch64GenDAGISel.inc"
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
}
bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool isWorthFolding(SDValue V) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
template<unsigned RegWidth>
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
}
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
bool SelectCMP_SWAP(SDNode *N);
bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
SDValue &Imm);
bool SelectSVEArithImm(SDValue N, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
};
} // end anonymous namespace
/// isIntImmediate - This method tests to see if the node is a constant
/// operand. If so Imm will receive the 32-bit value.
static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
Imm = C->getZExtValue();
return true;
}
return false;
}
// isIntImmediate - This method tests to see if the operand is a constant.
// If so, Imm will receive the value.
static bool isIntImmediate(SDValue N, uint64_t &Imm) {
return isIntImmediate(N.getNode(), Imm);
}
// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so, Imm will receive the 32-bit value.
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
uint64_t &Imm) {
return N->getOpcode() == Opc &&
isIntImmediate(N->getOperand(1).getNode(), Imm);
}
bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
switch(ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
// We need to make sure that this one operand does not end up in XZR, thus
// require the address to be in a PointerRegClass register.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
SDLoc dl(Op);
SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
SDValue NewOp =
SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, Op.getValueType(),
Op, RC), 0);
OutOps.push_back(NewOp);
return false;
}
return true;
}
/// SelectArithImmed - Select an immediate value that can be represented as
/// a 12-bit value shifted left by either 0 or 12. If so, return true with
/// Val set to the 12-bit value and Shift set to the shifter operand.
bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
SDValue &Shift) {
// This function is called from the addsub_shifted_imm ComplexPattern,
// which lists [imm] as the list of opcodes it's interested in; however,
// we still need to check whether the operand is actually an immediate
// here because the ComplexPattern opcode list is only used in
// root-level opcode matching.
if (!isa<ConstantSDNode>(N.getNode()))
return false;
uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
unsigned ShiftAmt;
if (Immed >> 12 == 0) {
ShiftAmt = 0;
} else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
ShiftAmt = 12;
Immed = Immed >> 12;
} else
return false;
unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
SDLoc dl(N);
Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
return true;
}
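// Illustrative sketch (not part of this file or patch): the same 12-bit,
// optionally-shifted-by-12 encoding rule as a standalone helper. Values up
// to 0xfff use LSL #0, values of the form 0xNNN000 use LSL #12, and anything
// else is not a valid AArch64 add/sub immediate.
#include <cstdint>
[[maybe_unused]] static bool encodeAddSubImm(uint64_t Value, unsigned &Imm12,
                                             unsigned &ShiftAmt) {
  if ((Value >> 12) == 0) {                         // fits in 12 bits, LSL #0
    Imm12 = unsigned(Value);
    ShiftAmt = 0;
    return true;
  }
  if ((Value & 0xfff) == 0 && (Value >> 24) == 0) { // 12 bits, LSL #12
    Imm12 = unsigned(Value >> 12);
    ShiftAmt = 12;
    return true;
  }
  return false;                                     // needs MOV + register form
}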
/// SelectNegArithImmed - As above, but negates the value before trying to
/// select it.
bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
SDValue &Shift) {
// This function is called from the addsub_shifted_imm ComplexPattern,
// which lists [imm] as the list of opcodes it's interested in; however,
// we still need to check whether the operand is actually an immediate
// here because the ComplexPattern opcode list is only used in
// root-level opcode matching.
if (!isa<ConstantSDNode>(N.getNode()))
return false;
// The immediate operand must be a 24-bit zero-extended immediate.
uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
// This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
// have the opposite effect on the C flag, so this pattern mustn't match under
// those circumstances.
if (Immed == 0)
return false;
if (N.getValueType() == MVT::i32)
Immed = ~((uint32_t)Immed) + 1;
else
Immed = ~Immed + 1ULL;
if (Immed & 0xFFFFFFFFFF000000ULL)
return false;
Immed &= 0xFFFFFFULL;
return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
Shift);
}
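// Illustrative note (not part of this patch): this hook lets a negative
// immediate be folded into the complementary instruction, e.g. "cmp w0, #-5"
// is selected as "cmn w0, #5" and an add of a negative constant as a sub,
// reusing the plain 12-bit immediate form selected above.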
/// getShiftTypeForNode - Translate a shift node to the corresponding
/// ShiftType value.
static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
switch (N.getOpcode()) {
default:
return AArch64_AM::InvalidShiftExtend;
case ISD::SHL:
return AArch64_AM::LSL;
case ISD::SRL:
return AArch64_AM::LSR;
case ISD::SRA:
return AArch64_AM::ASR;
case ISD::ROTR:
return AArch64_AM::ROR;
}
}
/// Determine whether it is worth it to fold SHL into the addressing
/// mode.
static bool isWorthFoldingSHL(SDValue V) {
assert(V.getOpcode() == ISD::SHL && "invalid opcode");
// It is worth folding a logical shift of up to three places.
auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
if (!CSD)
return false;
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal > 3)
return false;
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
const SDNode *Node = V.getNode();
for (SDNode *UI : Node->uses())
if (!isa<MemSDNode>(*UI))
for (SDNode *UII : UI->uses())
if (!isa<MemSDNode>(*UII))
return false;
return true;
}
/// Determine whether it is worth folding V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
// If a subtarget has a fastpath LSL we can fold a logical shift into
// the addressing mode and save a cycle.
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
isWorthFoldingSHL(V))
return true;
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
return true;
if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
return true;
}
// It hurts otherwise, since the value will be reused.
return false;
}
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
/// is not shifted, set the Shift operand to the default of "LSL 0". The logical
/// instructions allow the shifted register to be rotated, but the arithmetic
/// instructions do not. The AllowROR parameter specifies whether ROR is
/// supported.
bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
SDValue &Reg, SDValue &Shift) {
AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
if (ShType == AArch64_AM::InvalidShiftExtend)
return false;
if (!AllowROR && ShType == AArch64_AM::ROR)
return false;
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned BitSize = N.getValueSizeInBits();
unsigned Val = RHS->getZExtValue() & (BitSize - 1);
unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
Reg = N.getOperand(0);
Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
return isWorthFolding(N);
}
return false;
}
/// getExtendTypeForNode - Translate an extend node to the corresponding
/// ExtendType value.
static AArch64_AM::ShiftExtendType
getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
if (N.getOpcode() == ISD::SIGN_EXTEND ||
N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
EVT SrcVT;
if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
else
SrcVT = N.getOperand(0).getValueType();
if (!IsLoadStore && SrcVT == MVT::i8)
return AArch64_AM::SXTB;
else if (!IsLoadStore && SrcVT == MVT::i16)
return AArch64_AM::SXTH;
else if (SrcVT == MVT::i32)
return AArch64_AM::SXTW;
assert(SrcVT != MVT::i64 && "extend from 64-bits?");
return AArch64_AM::InvalidShiftExtend;
} else if (N.getOpcode() == ISD::ZERO_EXTEND ||
N.getOpcode() == ISD::ANY_EXTEND) {
EVT SrcVT = N.getOperand(0).getValueType();
if (!IsLoadStore && SrcVT == MVT::i8)
return AArch64_AM::UXTB;
else if (!IsLoadStore && SrcVT == MVT::i16)
return AArch64_AM::UXTH;
else if (SrcVT == MVT::i32)
return AArch64_AM::UXTW;
assert(SrcVT != MVT::i64 && "extend from 64-bits?");
return AArch64_AM::InvalidShiftExtend;
} else if (N.getOpcode() == ISD::AND) {
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!CSD)
return AArch64_AM::InvalidShiftExtend;
uint64_t AndMask = CSD->getZExtValue();
switch (AndMask) {
default:
return AArch64_AM::InvalidShiftExtend;
case 0xFF:
return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
case 0xFFFF:
return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
case 0xFFFFFFFF:
return AArch64_AM::UXTW;
}
}
return AArch64_AM::InvalidShiftExtend;
}
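// Illustrative sketch (not part of this file or patch): the AND-mask to
// extend-kind mapping used above, as a standalone table. UXTB/UXTH are only
// valid for arithmetic operands, not for load/store addressing, which is why
// the code above rejects them when IsLoadStore is true.
#include <cstdint>
enum class ExtendKind { Invalid, UXTB, UXTH, UXTW };
[[maybe_unused]] static ExtendKind extendForAndMask(uint64_t AndMask,
                                                    bool IsLoadStore) {
  switch (AndMask) {
  case 0xFF:       return IsLoadStore ? ExtendKind::Invalid : ExtendKind::UXTB;
  case 0xFFFF:     return IsLoadStore ? ExtendKind::Invalid : ExtendKind::UXTH;
  case 0xFFFFFFFF: return ExtendKind::UXTW;   // free 32->64-bit zero extend
  default:         return ExtendKind::Invalid;
  }
}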
// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
DL->getOpcode() != AArch64ISD::DUPLANE32)
return false;
SDValue SV = DL->getOperand(0);
if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
return false;
SDValue EV = SV.getOperand(1);
if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
LaneOp = EV.getOperand(0);
return true;
}
// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
// high lane extract.
static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
SDValue &LaneOp, int &LaneIdx) {
if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
std::swap(Op0, Op1);
if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
return false;
}
StdOp = Op1;
return true;
}
/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
/// is a lane in the upper half of a 128-bit vector. Recognize and select this
/// so that we don't emit unnecessary lane extracts.
bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue MLAOp1; // Will hold ordinary multiplicand for MLA.
SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA.
int LaneIdx = -1; // Will hold the lane index.
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
LaneIdx)) {
std::swap(Op0, Op1);
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
LaneIdx))
return false;
}
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
unsigned MLAOpc = ~0U;
switch (N->getSimpleValueType(0).SimpleTy) {
default:
llvm_unreachable("Unrecognized MLA.");
case MVT::v4i16:
MLAOpc = AArch64::MLAv4i16_indexed;
break;
case MVT::v8i16:
MLAOpc = AArch64::MLAv8i16_indexed;
break;
case MVT::v2i32:
MLAOpc = AArch64::MLAv2i32_indexed;
break;
case MVT::v4i32:
MLAOpc = AArch64::MLAv4i32_indexed;
break;
}
ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
return true;
}
bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
SDLoc dl(N);
SDValue SMULLOp0;
SDValue SMULLOp1;
int LaneIdx;
if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
LaneIdx))
return false;
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
unsigned SMULLOpc = ~0U;
if (IntNo == Intrinsic::aarch64_neon_smull) {
switch (N->getSimpleValueType(0).SimpleTy) {
default:
llvm_unreachable("Unrecognized SMULL.");
case MVT::v4i32:
SMULLOpc = AArch64::SMULLv4i16_indexed;
break;
case MVT::v2i64:
SMULLOpc = AArch64::SMULLv2i32_indexed;
break;
}
} else if (IntNo == Intrinsic::aarch64_neon_umull) {
switch (N->getSimpleValueType(0).SimpleTy) {
default:
llvm_unreachable("Unrecognized SMULL.");
case MVT::v4i32:
SMULLOpc = AArch64::UMULLv4i16_indexed;
break;
case MVT::v2i64:
SMULLOpc = AArch64::UMULLv2i32_indexed;
break;
}
} else
llvm_unreachable("Unrecognized intrinsic.");
ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
return true;
}
/// Instructions that accept extend modifiers like UXTW expect the register
/// being extended to be a GPR32, but the incoming DAG might be acting on a
/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
/// this is the case.
static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
if (N.getValueType() == MVT::i32)
return N;
SDLoc dl(N);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
dl, MVT::i32, N, SubReg);
return SDValue(Node, 0);
}
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
template<signed Low, signed High, signed Scale>
bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
if ((MulImm % std::abs(Scale)) == 0) {
int64_t RDVLImm = MulImm / Scale;
if ((RDVLImm >= Low) && (RDVLImm <= High)) {
Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
return true;
}
}
return false;
}
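// Illustrative sketch (not part of this file or patch): the multiplier check
// above reduced to plain arithmetic. A VSCALE*N constant is only selectable
// as an RDVL/CNT-style immediate when N is a multiple of the instruction's
// scale and the resulting multiplier fits the [Low, High] immediate range.
#include <cstdint>
#include <cstdlib>
[[maybe_unused]] static bool fitsRDVLImm(int64_t MulImm, int64_t Scale,
                                         int64_t Low, int64_t High,
                                         int64_t &RDVLImm) {
  if (Scale == 0 || (MulImm % std::abs(Scale)) != 0) // Scale is never 0 above
    return false;
  RDVLImm = MulImm / Scale;
  return RDVLImm >= Low && RDVLImm <= High;
}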
/// SelectArithExtendedRegister - Select a "extended register" operand. This
/// operand folds in an extend followed by an optional left shift.
bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
SDValue &Shift) {
unsigned ShiftVal = 0;
AArch64_AM::ShiftExtendType Ext;
if (N.getOpcode() == ISD::SHL) {
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!CSD)
return false;
ShiftVal = CSD->getZExtValue();
if (ShiftVal > 4)
return false;
Ext = getExtendTypeForNode(N.getOperand(0));
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Reg = N.getOperand(0).getOperand(0);
} else {
Ext = getExtendTypeForNode(N);
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Reg = N.getOperand(0);
// Don't match if free 32-bit -> 64-bit zext can be used instead.
if (Ext == AArch64_AM::UXTW &&
Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
return false;
}
// AArch64 mandates that the RHS of the operation must use the smallest
// register class that could contain the size being extended from. Thus,
// if we're folding a (sext i8), we need the RHS to be a GPR32, even though
// there might not be an actual 32-bit value in the program. We can
// (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
Reg = narrowIfNeeded(CurDAG, Reg);
Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
MVT::i32);
return isWorthFolding(N);
}
/// If there's a use of this ADDlow that's not itself a load/store then we'll
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
/// leads to duplicated ADRP instructions.
static bool isWorthFoldingADDlow(SDValue N) {
for (auto Use : N->uses()) {
if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
Use->getOpcode() != ISD::ATOMIC_LOAD &&
Use->getOpcode() != ISD::ATOMIC_STORE)
return false;
// ldar and stlr have much more restrictive addressing modes (just a
// register).
if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
return false;
}
return true;
}
/// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
unsigned BW, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
// As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
// immediate mode selected here doesn't support labels/immediates, only base+offset.
if (CurDAG->isBaseWithConstantOffset(N)) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
if (IsSignedImm) {
int64_t RHSC = RHS->getSExtValue();
unsigned Scale = Log2_32(Size);
int64_t Range = 0x1LL << (BW - 1);
if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
RHSC < (Range << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
}
} else {
// unsigned Immediate
uint64_t RHSC = RHS->getZExtValue();
unsigned Scale = Log2_32(Size);
uint64_t Range = 0x1ULL << BW;
if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
}
}
}
}
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// stp x1, x2, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
SDValue &Base, SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
GlobalAddressSDNode *GAN =
dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
Base = N.getOperand(0);
OffImm = N.getOperand(1);
if (!GAN)
return true;
if (GAN->getOffset() % Size == 0 &&
GAN->getGlobal()->getPointerAlignment(DL) >= Size)
return true;
}
if (CurDAG->isBaseWithConstantOffset(N)) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int64_t RHSC = (int64_t)RHS->getZExtValue();
unsigned Scale = Log2_32(Size);
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
}
}
}
// Before falling back to our general case, check if the unscaled
// instructions can handle this. If so, that's preferable.
if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
return false;
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// ldr x0, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
/// immediate" address. This should only match when there is an offset that
/// is not valid for a scaled immediate addressing mode. The "Size" argument
/// is the size in bytes of the memory reference, which is needed here to know
/// what is valid for a scaled immediate.
bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
if (!CurDAG->isBaseWithConstantOffset(N))
return false;
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
// If the offset is valid as a scaled immediate, don't match here.
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
RHSC < (0x1000 << Log2_32(Size)))
return false;
if (RHSC >= -256 && RHSC < 256) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
const TargetLowering *TLI = getTargetLowering();
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
return true;
}
}
return false;
}
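// Illustrative sketch (not part of this file or patch): how the two selectors
// above split offsets between the scaled (uimm12 * Size) and unscaled (simm9)
// forms; anything else is materialized into a register first.
#include <cstdint>
enum class OffsetKind { Scaled, Unscaled, NeedsMaterialization };
[[maybe_unused]] static OffsetKind classifyOffset(int64_t Off, unsigned Size) {
  unsigned Scale = 0;
  for (unsigned S = Size; S > 1; S >>= 1) // Size is a power of two (1..16)
    ++Scale;                              // Scale = log2(Size)
  if ((Off & (Size - 1)) == 0 && Off >= 0 && Off < (0x1000LL << Scale))
    return OffsetKind::Scaled;             // LDR/STR with uimm12, scaled
  if (Off >= -256 && Off < 256)
    return OffsetKind::Unscaled;           // LDUR/STUR with simm9, in bytes
  return OffsetKind::NeedsMaterialization; // add the offset, then [reg]
}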
static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
SDLoc dl(N);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
SDValue ImpDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
MachineSDNode *Node = CurDAG->getMachineNode(
TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
return SDValue(Node, 0);
}
/// Check if the given SHL node (\p N) can be used to form an
/// extended register for an addressing mode.
bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
bool WantExtend, SDValue &Offset,
SDValue &SignExtend) {
assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
return false;
SDLoc dl(N);
if (WantExtend) {
AArch64_AM::ShiftExtendType Ext =
getExtendTypeForNode(N.getOperand(0), true);
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
} else {
Offset = N.getOperand(0);
SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
}
unsigned LegalShiftVal = Log2_32(Size);
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
return isWorthFolding(N);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
SDValue &DoShift) {
if (N.getOpcode() != ISD::ADD)
return false;
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
SDLoc dl(N);
// We don't want to match immediate adds here, because they are better lowered
// to the register-immediate addressing modes.
if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
return false;
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
if (!isa<MemSDNode>(*UI))
return false;
}
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
Base = LHS;
DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
return true;
}
// Try to match a shifted extend on the LHS.
if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
Base = RHS;
DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
return true;
}
// There was no shift, whatever else we find.
DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
// Try to match an unshifted extend on the LHS.
if (IsExtendedRegisterWorthFolding &&
(Ext = getExtendTypeForNode(LHS, true)) !=
AArch64_AM::InvalidShiftExtend) {
Base = RHS;
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
if (isWorthFolding(LHS))
return true;
}
// Try to match an unshifted extend on the RHS.
if (IsExtendedRegisterWorthFolding &&
(Ext = getExtendTypeForNode(RHS, true)) !=
AArch64_AM::InvalidShiftExtend) {
Base = LHS;
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
if (isWorthFolding(RHS))
return true;
}
return false;
}
// Check if the given immediate is preferred by ADD. If an immediate can be
// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and cannot be
// encoded by a single MOVZ, return true.
static bool isPreferredADD(int64_t ImmOff) {
// Constant in [0x0, 0xfff] can be encoded in ADD.
if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
return true;
// Check if it can be encoded in an "ADD LSL #12".
if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
// As a single MOVZ is faster than an "ADD ..., LSL #12", ignore such constants.
return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
(ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
return false;
}
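// Illustrative sketch (not part of this patch): a few concrete offsets and
// how the mask checks above classify them; the helper only exists as a
// sanity check for the constants in isPreferredADD.
#include <cassert>
[[maybe_unused]] static void isPreferredADDExamples() {
  assert(isPreferredADD(0xabc));     // fits a plain 12-bit ADD immediate
  assert(isPreferredADD(0xabc000));  // fits "ADD ..., LSL #12"; no single MOVZ
  assert(!isPreferredADD(0xab0000)); // "MOVZ ..., #0xab, LSL #16" is cheaper
}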
bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
SDValue &DoShift) {
if (N.getOpcode() != ISD::ADD)
return false;
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
SDLoc DL(N);
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
if (!isa<MemSDNode>(*UI))
return false;
}
// Watch out if RHS is a wide immediate: it cannot be selected into the
// [BaseReg+Imm] addressing mode, and it may not be encodable in an
// ADD/SUB. Instead it will use the [BaseReg + 0] addressing mode and generate
// instructions like:
// MOV X0, WideImmediate
// ADD X1, BaseReg, X0
// LDR X2, [X1, 0]
// For such situation, using [BaseReg, XReg] addressing mode can save one
// ADD/SUB:
// MOV X0, WideImmediate
// LDR X2, [BaseReg, X0]
if (isa<ConstantSDNode>(RHS)) {
int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
unsigned Scale = Log2_32(Size);
// Skip immediates that can be selected by the load/store addressing mode.
// Also skip immediates that can be encoded by a single ADD (SUB is also
// checked by using -ImmOff).
if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
return false;
SDValue Ops[] = { RHS };
SDNode *MOVI =
CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
SDValue MOVIV = SDValue(MOVI, 0);
// This ADD of two X registers will be selected into [Reg+Reg] mode.
N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
}
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
Base = LHS;
DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
return true;
}
// Try to match a shifted extend on the LHS.
if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
Base = RHS;
DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
return true;
}
// Match any non-shifted, non-extend, non-immediate add expression.
Base = LHS;
Offset = RHS;
SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
// Reg1 + Reg2 is free: no check needed.
return true;
}
SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {
AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2, AArch64::dsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
}
SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {
AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
}
SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
AArch64::ZPR3RegClassID,
AArch64::ZPR4RegClassID};
static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
}
SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
const unsigned RegClassIDs[],
const unsigned SubRegs[]) {
// There's no special register-class for a vector-list of 1 element: it's just
// a vector.
if (Regs.size() == 1)
return Regs[0];
assert(Regs.size() >= 2 && Regs.size() <= 4);
SDLoc DL(Regs[0]);
SmallVector<SDValue, 4> Ops;
// First operand of REG_SEQUENCE is the desired RegClass.
Ops.push_back(
CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
// Then we get pairs of source & subregister-position for the components.
for (unsigned i = 0; i < Regs.size(); ++i) {
Ops.push_back(Regs[i]);
Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
}
SDNode *N =
CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
return SDValue(N, 0);
}
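// Illustrative note (not part of this patch): for a three-register D tuple,
// createTuple builds a REG_SEQUENCE whose operands are the register class
// followed by (value, subreg-index) pairs, roughly:
//   REG_SEQUENCE DDDRegClassID, V0, dsub0, V1, dsub1, V2, dsub2
// The tuple register class forces the allocator to place V0-V2 in consecutive
// D registers, as the vector-list instructions require.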
void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
bool isExt) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
unsigned ExtOff = isExt;
// Form a REG_SEQUENCE to force register allocation.
unsigned Vec0Off = ExtOff + 1;
SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
N->op_begin() + Vec0Off + NumVecs);
SDValue RegSeq = createQTuple(Regs);
SmallVector<SDValue, 6> Ops;
if (isExt)
Ops.push_back(N->getOperand(1));
Ops.push_back(RegSeq);
Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
}
bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isUnindexed())
return false;
EVT VT = LD->getMemoryVT();
EVT DstVT = N->getValueType(0);
ISD::MemIndexedMode AM = LD->getAddressingMode();
bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
// We're not doing validity checking here. That was done when checking
// if we should mark the load as indexed or not. We're just selecting
// the right instruction.
unsigned Opcode = 0;
ISD::LoadExtType ExtType = LD->getExtensionType();
bool InsertTo64 = false;
if (VT == MVT::i64)
Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
else if (VT == MVT::i32) {
if (ExtType == ISD::NON_EXTLOAD)
Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
else if (ExtType == ISD::SEXTLOAD)
Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
else {
Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
InsertTo64 = true;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
}
} else if (VT == MVT::i16) {
if (ExtType == ISD::SEXTLOAD) {
if (DstVT == MVT::i64)
Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
else
Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
} else {
Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
InsertTo64 = DstVT == MVT::i64;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
}
} else if (VT == MVT::i8) {
if (ExtType == ISD::SEXTLOAD) {
if (DstVT == MVT::i64)
Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
else
Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
} else {
Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
InsertTo64 = DstVT == MVT::i64;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
}
} else if (VT == MVT::f16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::bf16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
} else if (VT.is128BitVector()) {
Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
} else
return false;
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
int OffsetVal = (int)OffsetOp->getZExtValue();
SDLoc dl(N);
SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
SDValue Ops[] = { Base, Offset, Chain };
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
// Either way, we're replacing the node, so tell the caller that.
SDValue LoadedVal = SDValue(Res, 1);
if (InsertTo64) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
LoadedVal =
SDValue(CurDAG->getMachineNode(
AArch64::SUBREG_TO_REG, dl, MVT::i64,
CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
SubReg),
0);
}
ReplaceUses(SDValue(N, 0), LoadedVal);
ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
CurDAG->RemoveDeadNode(N);
return true;
}
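// Illustrative note (not part of this patch): the selector above maps an
// indexed load to the matching pre-/post-increment instruction, e.g. an i64
// load with a post-increment offset of 8 becomes LDRXpost ("ldr x0, [x1], #8"),
// and narrow zero-extending loads to i64 are widened with SUBREG_TO_REG.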
void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
SDValue Ops[] = {N->getOperand(2), // Mem operand
Chain};
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
SDValue SuperReg = SDValue(Ld, 0);
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i),
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
CurDAG->RemoveDeadNode(N);
}
void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
unsigned Opc, unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
SDValue Ops[] = {N->getOperand(1), // Mem operand
N->getOperand(2), // Incremental
Chain};
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Untyped, MVT::Other};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Update uses of write back register
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
// Update uses of vector list
SDValue SuperReg = SDValue(Ld, 1);
if (NumVecs == 1)
ReplaceUses(SDValue(N, 0), SuperReg);
else
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i),
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
// Update the chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
CurDAG->RemoveDeadNode(N);
}
/// Optimize \param OldBase and \param OldOffset selecting the best addressing
/// mode. Returns a tuple consisting of an Opcode, an SDValue representing the
/// new Base and an SDValue representing the new offset.
-template <unsigned Scale>
std::tuple<unsigned, SDValue, SDValue>
-AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
- const unsigned Opc_ri,
+AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
+ unsigned Opc_ri,
const SDValue &OldBase,
- const SDValue &OldOffset) {
+ const SDValue &OldOffset,
+ unsigned Scale) {
SDValue NewBase = OldBase;
SDValue NewOffset = OldOffset;
// Detect a possible Reg+Imm addressing mode.
const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
N, OldBase, NewBase, NewOffset);
// Detect a possible reg+reg addressing mode, but only if we haven't already
// detected a Reg+Imm one.
const bool IsRegReg =
- !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset);
+ !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);
// Select the instruction.
return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
}
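// Illustrative note (not part of this patch): the reg+imm form (Opc_ri) is
// tried first because it needs no extra register; the reg+reg form (Opc_rr)
// is only chosen when no in-range immediate is found but the (suitably
// scaled) offset is available in a register.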
void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
- const unsigned Opc) {
+ unsigned Scale, unsigned Opc_ri,
+ unsigned Opc_rr) {
+ assert(Scale < 4 && "Invalid scaling value.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
+ // Optimize addressing mode.
+ SDValue Base, Offset;
+ unsigned Opc;
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
+ N, Opc_rr, Opc_ri, N->getOperand(2),
+ CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
+
SDValue Ops[] = {N->getOperand(1), // Predicate
- N->getOperand(2), // Memory operand
- CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+ Base, // Memory operand
+ Offset, Chain};
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
SDValue SuperReg = SDValue(Load, 0);
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
AArch64::zsub0 + i, DL, VT, SuperReg));
// Copy chain
unsigned ChainIdx = NumVecs;
ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
CurDAG->RemoveDeadNode(N);
}
void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
// Form a REG_SEQUENCE to force register allocation.
bool Is128Bit = VT.getSizeInBits() == 128;
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
}
-template <unsigned Scale>
void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
- const unsigned Opc_rr,
- const unsigned Opc_ri) {
+ unsigned Scale, unsigned Opc_rr,
+ unsigned Opc_ri) {
SDLoc dl(N);
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
SDValue RegSeq = createZTuple(Regs);
// Optimize addressing mode.
unsigned Opc;
SDValue Offset, Base;
- std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>(
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
- CurDAG->getTargetConstant(0, dl, MVT::i64));
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
Base, // address
Offset, // offset
N->getOperand(0)}; // chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
ReplaceNode(N, St);
}
bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
// Try to match it for the frame address
if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
int FI = FINode->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
return false;
}
void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Other}; // Type for the Chain
// Form a REG_SEQUENCE to force register allocation.
bool Is128Bit = VT.getSizeInBits() == 128;
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
SDValue Ops[] = {RegSeq,
N->getOperand(NumVecs + 1), // base register
N->getOperand(NumVecs + 2), // Incremental
N->getOperand(0)}; // Chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
ReplaceNode(N, St);
}
namespace {
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
class WidenVector {
SelectionDAG &DAG;
public:
WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
SDValue operator()(SDValue V64Reg) {
EVT VT = V64Reg.getValueType();
unsigned NarrowSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
SDLoc DL(V64Reg);
SDValue Undef =
SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
}
};
} // namespace
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
EVT VT = V128Reg.getValueType();
unsigned WideSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
V128Reg);
}
void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
WidenVector(*CurDAG));
SDValue RegSeq = createQTuple(Regs);
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
SDValue SuperReg = SDValue(Ld, 0);
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
if (Narrow)
NV = NarrowVector(NV, *CurDAG);
ReplaceUses(SDValue(N, i), NV);
}
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
CurDAG->RemoveDeadNode(N);
}
void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
WidenVector(*CurDAG));
SDValue RegSeq = createQTuple(Regs);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
RegSeq->getValueType(0), MVT::Other};
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
SDValue Ops[] = {RegSeq,
CurDAG->getTargetConstant(LaneNo, dl,
MVT::i64), // Lane Number
N->getOperand(NumVecs + 2), // Base register
N->getOperand(NumVecs + 3), // Incremental
N->getOperand(0)};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Update uses of the write back register
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
// Update uses of the vector list
SDValue SuperReg = SDValue(Ld, 1);
if (NumVecs == 1) {
ReplaceUses(SDValue(N, 0),
Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
} else {
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
SuperReg);
if (Narrow)
NV = NarrowVector(NV, *CurDAG);
ReplaceUses(SDValue(N, i), NV);
}
}
// Update the Chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
CurDAG->RemoveDeadNode(N);
}
void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
WidenVector(*CurDAG));
SDValue RegSeq = createQTuple(Regs);
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
}
void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
WidenVector(*CurDAG));
SDValue RegSeq = createQTuple(Regs);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Other};
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 2), // Base Register
N->getOperand(NumVecs + 3), // Incremental
N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
}
static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
unsigned &Opc, SDValue &Opd0,
unsigned &LSB, unsigned &MSB,
unsigned NumberOfIgnoredLowBits,
bool BiggerPattern) {
assert(N->getOpcode() == ISD::AND &&
"N must be a AND operation to call this function");
EVT VT = N->getValueType(0);
// Here we can test the type of VT and return false when the type does not
// match, but since it is done prior to that call in the current context
// we turned that into an assert to avoid redundant code.
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
// FIXME: simplify-demanded-bits in DAGCombine will probably have
// changed the AND node to a 32-bit mask operation. We'll have to
// undo that as part of the transform here if we want to catch all
// the opportunities.
// Currently the NumberOfIgnoredLowBits argument helps to recover
// from these situations when matching the bigger pattern (bitfield insert).
// For unsigned extracts, check for a shift right and mask
uint64_t AndImm = 0;
if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
return false;
const SDNode *Op0 = N->getOperand(0).getNode();
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
// simplified. Try to undo that
AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (AndImm & (AndImm + 1))
return false;
bool ClampMSB = false;
uint64_t SrlImm = 0;
// Handle the SRL + ANY_EXTEND case.
if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
// Extend the incoming operand of the SRL to 64-bit.
Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
// Make sure to clamp the MSB so that we preserve the semantics of the
// original operations.
ClampMSB = true;
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
SrlImm)) {
// If the shift result was truncated, we can still combine them.
Opd0 = Op0->getOperand(0).getOperand(0);
// Use the type of SRL node.
VT = Opd0->getValueType(0);
} else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
Opd0 = Op0->getOperand(0);
} else if (BiggerPattern) {
// Let's pretend a 0 shift right has been performed.
// The resulting code will be at least as good as the original one
// plus it may expose more opportunities for bitfield insert pattern.
// FIXME: Currently we limit this to the bigger pattern, because
// some optimizations expect AND and not UBFM.
Opd0 = N->getOperand(0);
} else
return false;
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
LLVM_DEBUG(
(dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
LSB = SrlImm;
MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
: countTrailingOnes<uint64_t>(AndImm)) -
1;
if (ClampMSB)
// Since we're moving the extend before the right shift operation, we need
// to clamp the MSB to make sure we don't shift in undefined bits instead of
// the zeros which would get shifted in with the original right shift
// operation.
MSB = MSB > 31 ? 31 : MSB;
Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
return true;
}
static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr,
unsigned &Imms) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT VT = N->getValueType(0);
unsigned BitWidth = VT.getSizeInBits();
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
SDValue Op = N->getOperand(0);
if (Op->getOpcode() == ISD::TRUNCATE) {
Op = Op->getOperand(0);
VT = Op->getValueType(0);
BitWidth = VT.getSizeInBits();
}
uint64_t ShiftImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
return false;
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
if (ShiftImm + Width > BitWidth)
return false;
Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
Opd0 = Op.getOperand(0);
Immr = ShiftImm;
Imms = ShiftImm + Width - 1;
return true;
}
static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &LSB,
unsigned &MSB) {
// We are looking for the following pattern, which extracts a run of
// contiguous bits from the source value and places it at the LSB of the
// destination value; all other bits of the destination value are set to zero:
//
// Value2 = AND Value, MaskImm
// SRL Value2, ShiftImm
//
// where MaskImm >> ShiftImm determines the width of the extracted bitfield.
//
// This gets selected into a single UBFM:
//
// UBFM Value, ShiftImm, BitWide + SrlImm - 1
//
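// For example, (x & 0xff0) >> 4 gives AndMask = 0xff0 and SrlImm = 4, so
// BitWide = 8 and we select UBFM x, #4, #11 (LSB = 4, MSB = 8 + 4 - 1).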
if (N->getOpcode() != ISD::SRL)
return false;
uint64_t AndMask = 0;
if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
return false;
Opd0 = N->getOperand(0).getOperand(0);
uint64_t SrlImm = 0;
if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
// Check whether we really have several bits extract here.
unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
if (BitWide && isMask_64(AndMask >> SrlImm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
else
Opc = AArch64::UBFMXri;
LSB = SrlImm;
MSB = BitWide + SrlImm - 1;
return true;
}
return false;
}
static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
unsigned &Immr, unsigned &Imms,
bool BiggerPattern) {
assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
"N must be a SHR/SRA operation to call this function");
EVT VT = N->getValueType(0);
// Here we could test the type of VT and return false when it does not
// match, but since that check is done before this function is called in the
// current context, we turned it into an assert to avoid redundant code.
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
// Check for AND + SRL doing several bits extract.
if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
return true;
// We're looking for a shift of a shift.
uint64_t ShlImm = 0;
uint64_t TruncBits = 0;
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
Opd0 = N->getOperand(0).getOperand(0);
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
// We are looking for a shift of a truncate. A truncate from i64 to i32 can
// be considered as setting the high 32 bits to zero. Our strategy here is to
// always generate a 64-bit UBFM. This consistency will help the CSE pass
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
VT = Opd0.getValueType();
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
// Let's pretend a 0 shift left has been performed.
// FIXME: Currently we limit this to the bigger pattern case,
// because some optimizations expect AND and not UBFM
Opd0 = N->getOperand(0);
} else
return false;
// Missing combines/constant folding may have left us with strange
// constants.
if (ShlImm >= VT.getSizeInBits()) {
LLVM_DEBUG(
(dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
uint64_t SrlImm = 0;
if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
"bad amount in shift node!");
int immr = SrlImm - ShlImm;
Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
// SRA requires a signed extraction
if (VT == MVT::i32)
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
else
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
return true;
}
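/// Try to fold a sign-extend of a right-shifted i32 into a single 64-bit SBFM.
/// For example, (i64 (sext (i32 (sra x, #5)))) is selected as SBFMXri with
/// Immr = 5 and Imms = 31, i.e. a signed extract of bits [31:5] of the widened
/// operand.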
bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
assert(N->getOpcode() == ISD::SIGN_EXTEND);
EVT VT = N->getValueType(0);
EVT NarrowVT = N->getOperand(0)->getValueType(0);
if (VT != MVT::i64 || NarrowVT != MVT::i32)
return false;
uint64_t ShiftImm;
SDValue Op = N->getOperand(0);
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
return false;
SDLoc dl(N);
// Extend the incoming operand of the shift to 64-bits.
SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
unsigned Immr = ShiftImm;
unsigned Imms = NarrowVT.getSizeInBits() - 1;
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
return true;
}
/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
/// extract of a subvector.
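/// For example, (v4f32 (fp_extend (v4f16 (extract_subvector (v8f16 x), 4))))
/// is selected as FCVTLv8i16, the fcvtl2 form that converts the high half.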
bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
assert(N->getOpcode() == ISD::FP_EXTEND);
// There are 2 forms of fcvtl2 - extend to double or extend to float.
SDValue Extract = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT NarrowVT = Extract.getValueType();
if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
(VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
return false;
// Optionally look past a bitcast.
Extract = peekThroughBitcasts(Extract);
if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// Match extract from start of high half index.
// Example: v8i16 -> v4i16 means the extract must begin at index 4.
unsigned ExtractIndex = Extract.getConstantOperandVal(1);
if (ExtractIndex != Extract.getValueType().getVectorNumElements())
return false;
auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
return true;
}
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
bool BiggerPattern = false) {
if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
return false;
switch (N->getOpcode()) {
default:
if (!N->isMachineOpcode())
return false;
break;
case ISD::AND:
return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
NumberOfIgnoredLowBits, BiggerPattern);
case ISD::SRL:
case ISD::SRA:
return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
case ISD::SIGN_EXTEND_INREG:
return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
}
unsigned NOpc = N->getMachineOpcode();
switch (NOpc) {
default:
return false;
case AArch64::SBFMWri:
case AArch64::UBFMWri:
case AArch64::SBFMXri:
case AArch64::UBFMXri:
Opc = NOpc;
Opd0 = N->getOperand(0);
Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
return true;
}
// Unreachable
return false;
}
bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
unsigned Opc, Immr, Imms;
SDValue Opd0;
if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
return false;
EVT VT = N->getValueType(0);
SDLoc dl(N);
// If the bit extract operation is 64bit but the original type is 32bit, we
// need to add one EXTRACT_SUBREG.
if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
MVT::i32, SDValue(BFM, 0), SubReg));
return true;
}
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
/// Does DstMask form a complementary pair with the mask provided by
/// BitsToBeInserted, suitable for use in a BFI instruction? Roughly speaking,
/// this asks whether DstMask zeroes precisely those bits that will be set by
/// the other half.
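/// For example, with an i32 type, DstMask == 0xffff0000 and a BitsToBeInserted
/// value occupying only bits [15:0] form such a pair: they do not overlap and
/// together cover the whole register.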
static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
unsigned NumberOfIgnoredHighBits, EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) &&
"i32 or i64 mask type expected!");
unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
APInt SignificantDstMask = APInt(BitWidth, DstMask);
APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
(SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
}
// Look for bits that will be useful for later uses.
// A bit is considered useless as soon as it is dropped and never used
// before it has been dropped.
// E.g., looking for the useful bits of x:
// 1. y = x & 0x7
// 2. z = y >> 2
// After #1, the useful bits of x are 0x7; these useful bits of x live
// through y.
// After #2, the useful bits of x are 0x4.
// However, if x is used by an unpredictable instruction, then all its bits
// are useful.
// E.g.
// 1. y = x & 0x7
// 2. z = y >> 2
// 3. str x, [@x]
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
getUsefulBits(Op, UsefulBits, Depth + 1);
}
static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
uint64_t Imm, uint64_t MSB,
unsigned Depth) {
// inherit the bitwidth value
APInt OpUsefulBits(UsefulBits);
OpUsefulBits = 1;
if (MSB >= Imm) {
OpUsefulBits <<= MSB - Imm + 1;
--OpUsefulBits;
// The interesting part will be in the lower part of the result
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was starting at Imm in the argument
OpUsefulBits <<= Imm;
} else {
OpUsefulBits <<= MSB + 1;
--OpUsefulBits;
// The interesting part will be shifted in the result
OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was at zero in the argument
OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
}
UsefulBits &= OpUsefulBits;
}
static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
uint64_t MSB =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
}
static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t ShiftTypeAndValue =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
APInt Mask(UsefulBits);
Mask.clearAllBits();
Mask.flipAllBits();
if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
// Shift Left
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
Mask <<= ShiftAmt;
getUsefulBits(Op, Mask, Depth + 1);
Mask.lshrInPlace(ShiftAmt);
} else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
// Shift Right
// We do not handle AArch64_AM::ASR, because the sign will change the
// number of useful bits
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
Mask.lshrInPlace(ShiftAmt);
getUsefulBits(Op, Mask, Depth + 1);
Mask <<= ShiftAmt;
} else
return;
UsefulBits &= Mask;
}
static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
uint64_t MSB =
cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
APInt OpUsefulBits(UsefulBits);
OpUsefulBits = 1;
APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
ResultUsefulBits.flipAllBits();
APInt Mask(UsefulBits.getBitWidth(), 0);
getUsefulBits(Op, ResultUsefulBits, Depth + 1);
if (MSB >= Imm) {
// The instruction is a BFXIL.
uint64_t Width = MSB - Imm + 1;
uint64_t LSB = Imm;
OpUsefulBits <<= Width;
--OpUsefulBits;
if (Op.getOperand(1) == Orig) {
// Copy the low bits from the result to bits starting from LSB.
Mask = ResultUsefulBits & OpUsefulBits;
Mask <<= LSB;
}
if (Op.getOperand(0) == Orig)
// Bits starting from LSB in the input contribute to the result.
Mask |= (ResultUsefulBits & ~OpUsefulBits);
} else {
// The instruction is a BFI.
uint64_t Width = MSB + 1;
uint64_t LSB = UsefulBits.getBitWidth() - Imm;
OpUsefulBits <<= Width;
--OpUsefulBits;
OpUsefulBits <<= LSB;
if (Op.getOperand(1) == Orig) {
// Copy the bits from the result to the zero bits.
Mask = ResultUsefulBits & OpUsefulBits;
Mask.lshrInPlace(LSB);
}
if (Op.getOperand(0) == Orig)
Mask |= (ResultUsefulBits & ~OpUsefulBits);
}
UsefulBits &= Mask;
}
static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
SDValue Orig, unsigned Depth) {
// Users of this node should have already been instruction selected
// FIXME: Can we turn that into an assert?
if (!UserNode->isMachineOpcode())
return;
switch (UserNode->getMachineOpcode()) {
default:
return;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
case AArch64::ANDWri:
case AArch64::ANDXri:
// We increment Depth only when we call the getUsefulBits
return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
Depth);
case AArch64::UBFMWri:
case AArch64::UBFMXri:
return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
case AArch64::ORRWrs:
case AArch64::ORRXrs:
if (UserNode->getOperand(1) != Orig)
return;
return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
Depth);
case AArch64::BFMWri:
case AArch64::BFMXri:
return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
case AArch64::STRBBui:
case AArch64::STURBBi:
if (UserNode->getOperand(0) != Orig)
return;
UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
return;
case AArch64::STRHHui:
case AArch64::STURHHi:
if (UserNode->getOperand(0) != Orig)
return;
UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
return;
}
}
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
if (Depth >= SelectionDAG::MaxRecursionDepth)
return;
// Initialize UsefulBits
if (!Depth) {
unsigned Bitwidth = Op.getScalarValueSizeInBits();
// At the beginning, assume every produced bit is useful
UsefulBits = APInt(Bitwidth, 0);
UsefulBits.flipAllBits();
}
APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
for (SDNode *Node : Op.getNode()->uses()) {
// A use cannot produce useful bits
APInt UsefulBitsForUse = APInt(UsefulBits);
getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
UsersUsefulBits |= UsefulBitsForUse;
}
// UsefulBits contains the produced bits that are meaningful for the
// current definition, thus a user cannot make a bit meaningful at
// this point
UsefulBits &= UsersUsefulBits;
}
/// Create a machine node performing a notional SHL of Op by ShlAmount. If
/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
/// 0, return Op unchanged.
static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
if (ShlAmount == 0)
return Op;
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned BitWidth = VT.getSizeInBits();
unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
SDNode *ShiftNode;
if (ShlAmount > 0) {
// LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, dl, VT, Op,
CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
} else {
// LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
assert(ShlAmount < 0 && "expected right shift");
int ShrAmount = -ShlAmount;
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
}
return SDValue(ShiftNode, 0);
}
/// Does this tree qualify as an attempt to move a bitfield into position,
/// essentially "(and (shl VAL, N), Mask)".
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
bool BiggerPattern,
SDValue &Src, int &ShiftAmount,
int &MaskWidth) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
(void)BitWidth;
assert(BitWidth == 32 || BitWidth == 64);
KnownBits Known = CurDAG->computeKnownBits(Op);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value
uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
// Discard a constant AND mask if present. It's safe because the node will
// already have been factored into the computeKnownBits calculation above.
uint64_t AndImm;
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
Op = Op.getOperand(0);
}
// Don't match if the SHL has more than one use, since then we'll end up
// generating SHL+UBFIZ instead of just keeping SHL+AND.
if (!BiggerPattern && !Op.hasOneUse())
return false;
uint64_t ShlImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
return false;
Op = Op.getOperand(0);
if (!isShiftedMask_64(NonZeroBits))
return false;
ShiftAmount = countTrailingZeros(NonZeroBits);
MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
// amount. BiggerPattern is true when this pattern is being matched for BFI,
// BiggerPattern is false when this pattern is being matched for UBFIZ, in
// which case it is not profitable to insert an extra shift.
if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
return false;
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
return true;
}
static bool isShiftedMask(uint64_t Mask, EVT VT) {
assert(VT == MVT::i32 || VT == MVT::i64);
if (VT == MVT::i32)
return isShiftedMask_32(Mask);
return isShiftedMask_64(Mask);
}
// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
// inserted only sets known zero bits.
static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
unsigned BitWidth = VT.getSizeInBits();
uint64_t OrImm;
if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
return false;
// Skip this transformation if the OR immediate can be encoded directly as an
// ORR logical immediate. Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL,
// which is most likely performance neutral.
if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
return false;
uint64_t MaskImm;
SDValue And = N->getOperand(0);
// Must be a single use AND with an immediate operand.
if (!And.hasOneUse() ||
!isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
return false;
// Compute the Known Zero for the AND as this allows us to catch more general
// cases than just looking for AND with imm.
KnownBits Known = CurDAG->computeKnownBits(And);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value.
uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
// The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
return false;
// The bits being inserted must only set those bits that are known to be zero.
if ((OrImm & NotKnownZero) != 0) {
// FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
// currently handle this case.
return false;
}
// BFI/BFXIL dst, src, #lsb, #width.
int LSB = countTrailingOnes(NotKnownZero);
int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
// BFI/BFXIL is an alias of BFM, so translate to BFM operands.
unsigned ImmR = (BitWidth - LSB) % BitWidth;
unsigned ImmS = Width - 1;
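// For example, on i32 with LSB = 8 and Width = 8 this is BFI dst, src, #8, #8,
// i.e. BFM dst, src, #24, #7.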
// If we're creating a BFI instruction avoid cases where we need more
// instructions to materialize the BFI constant as compared to the original
// ORR. A BFXIL will use the same constant as the original ORR, so the code
// should be no worse in this case.
bool IsBFI = LSB != 0;
uint64_t BFIImm = OrImm >> LSB;
if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
// We have a BFI instruction and we know the constant can't be materialized
// with a ORR-immediate with the zero register.
unsigned OrChunks = 0, BFIChunks = 0;
for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
if (((OrImm >> Shift) & 0xFFFF) != 0)
++OrChunks;
if (((BFIImm >> Shift) & 0xFFFF) != 0)
++BFIChunks;
}
if (BFIChunks > OrChunks)
return false;
}
// Materialize the constant to be inserted.
SDLoc DL(N);
unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
SDNode *MOVI = CurDAG->getMachineNode(
MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
// Create the BFI/BFXIL instruction.
SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
unsigned BitWidth = VT.getSizeInBits();
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
// Given an OR operation, check if we have the following pattern
// ubfm c, b, imm, imm2 (or something that does the same job, see
// isBitfieldExtractOp)
// d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
// countTrailingZeros(mask2) == imm2 - imm + 1
// f = d | c
// if yes, replace the OR instruction with:
// f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
// OR is commutative, so check all combinations of operand order and values
// of BiggerPattern, i.e.
// Opd0, Opd1, BiggerPattern=false
// Opd1, Opd0, BiggerPattern=false
// Opd0, Opd1, BiggerPattern=true
// Opd1, Opd0, BiggerPattern=true
// Several of these combinations may match, so check with BiggerPattern=false
// first since that will produce better results by matching more instructions
// and/or inserting fewer extra instructions.
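// For example, with c = UBFM b, #4, #11 (an 8-bit extract starting at bit 4)
// and d = e & 0xffffff00, f = d | c can be selected as BFM e, b, #4, #11,
// i.e. BFXIL e, b, #4, #8.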
for (int I = 0; I < 4; ++I) {
SDValue Dst, Src;
unsigned ImmR, ImmS;
bool BiggerPattern = I / 2;
SDValue OrOpd0Val = N->getOperand(I % 2);
SDNode *OrOpd0 = OrOpd0Val.getNode();
SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
SDNode *OrOpd1 = OrOpd1Val.getNode();
unsigned BFXOpc;
int DstLSB, Width;
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
NumberOfIgnoredLowBits, BiggerPattern)) {
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
(BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
continue;
// Compute the width of the bitfield insertion
DstLSB = 0;
Width = ImmS - ImmR + 1;
// FIXME: This constraint is to catch bitfield insertion; we may
// want to widen the pattern if we want to grab the general bitfield
// move case.
if (Width <= 0)
continue;
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
} else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
BiggerPattern,
Src, DstLSB, Width)) {
ImmR = (BitWidth - DstLSB) % BitWidth;
ImmS = Width - 1;
} else
continue;
// Check the second part of the pattern
EVT VT = OrOpd1Val.getValueType();
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
// Compute the Known Zero for the candidate of the first operand.
// This allows us to catch more general cases than just looking for an
// AND with an immediate. Indeed, simplify-demanded-bits may have removed
// the AND instruction because it proved it was useless.
KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
// Check if there is enough room for the second operand to appear
// in the first one
APInt BitsToBeInserted =
APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
if ((BitsToBeInserted & ~Known.Zero) != 0)
continue;
// Set the first operand
uint64_t Imm;
if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
// In that case, we can eliminate the AND
Dst = OrOpd1->getOperand(0);
else
// Maybe the AND has been removed by simplify-demanded-bits
// or is useful because it discards more bits
Dst = OrOpd1Val;
// both parts match
SDLoc DL(N);
SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
// Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
// Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
// mask (e.g., 0x000ffff0).
uint64_t Mask0Imm, Mask1Imm;
SDValue And0 = N->getOperand(0);
SDValue And1 = N->getOperand(1);
if (And0.hasOneUse() && And1.hasOneUse() &&
isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
(isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
// ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
// (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
// bits to be inserted.
if (isShiftedMask(Mask0Imm, VT)) {
std::swap(And0, And1);
std::swap(Mask0Imm, Mask1Imm);
}
SDValue Src = And1->getOperand(0);
SDValue Dst = And0->getOperand(0);
unsigned LSB = countTrailingZeros(Mask1Imm);
int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
// The BFXIL inserts the low-order bits from a source register, so right
// shift the needed bits into place.
SDLoc DL(N);
unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
SDNode *LSR = CurDAG->getMachineNode(
ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
// BFXIL is an alias of BFM, so translate to BFM operands.
unsigned ImmR = (BitWidth - LSB) % BitWidth;
unsigned ImmS = Width - 1;
// Create the BFXIL instruction.
SDValue Ops[] = {Dst, SDValue(LSR, 0),
CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
return false;
}
bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
if (N->getOpcode() != ISD::OR)
return false;
APInt NUsefulBits;
getUsefulBits(SDValue(N, 0), NUsefulBits);
// If no bits are useful, just return UNDEF.
if (!NUsefulBits) {
CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
return true;
}
if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
return true;
return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
}
/// tryBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
/// equivalent of a left shift by a constant amount followed by an AND masking
/// out a contiguous set of bits.
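/// For example, on i32, ((x << 4) & 0xff0) is selected as UBFM x, #28, #7,
/// i.e. UBFIZ x, #4, #8.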
bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
if (N->getOpcode() != ISD::AND)
return false;
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
SDValue Op0;
int DstLSB, Width;
if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
Op0, DstLSB, Width))
return false;
// ImmR is the rotate right amount.
unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
// ImmS is the most significant bit of the source to be moved.
unsigned ImmS = Width - 1;
SDLoc DL(N);
SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
/// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
/// variable shift/rotate instructions.
bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned Opc;
switch (N->getOpcode()) {
case ISD::ROTR:
Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
break;
case ISD::SHL:
Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
break;
case ISD::SRL:
Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
break;
case ISD::SRA:
Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
break;
default:
return false;
}
uint64_t Size;
uint64_t Bits;
if (VT == MVT::i32) {
Bits = 5;
Size = 32;
} else if (VT == MVT::i64) {
Bits = 6;
Size = 64;
} else
return false;
SDValue ShiftAmt = N->getOperand(1);
SDLoc DL(N);
SDValue NewShiftAmt;
// Skip over an extend of the shift amount.
if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
ShiftAmt = ShiftAmt->getOperand(0);
if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
SDValue Add0 = ShiftAmt->getOperand(0);
SDValue Add1 = ShiftAmt->getOperand(1);
uint64_t Add0Imm;
uint64_t Add1Imm;
// If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
// to avoid the ADD/SUB.
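// For example, an i64 shift by (amt + 64) only depends on the low 6 bits of
// the shift amount, so we can shift by amt directly.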
if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
NewShiftAmt = Add0;
// If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
// generate a NEG instead of a SUB of a constant.
else if (ShiftAmt->getOpcode() == ISD::SUB &&
isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
(Add0Imm % Size == 0)) {
unsigned NegOpc;
unsigned ZeroReg;
EVT SubVT = ShiftAmt->getValueType(0);
if (SubVT == MVT::i32) {
NegOpc = AArch64::SUBWrr;
ZeroReg = AArch64::WZR;
} else {
assert(SubVT == MVT::i64);
NegOpc = AArch64::SUBXrr;
ZeroReg = AArch64::XZR;
}
SDValue Zero =
CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
MachineSDNode *Neg =
CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
NewShiftAmt = SDValue(Neg, 0);
} else
return false;
} else {
// If the shift amount is masked with an AND, check that the mask covers the
// bits that are implicitly ANDed off by the above opcodes and if so, skip
// the AND.
uint64_t MaskImm;
if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
!isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
return false;
if (countTrailingOnes(MaskImm) < Bits)
return false;
NewShiftAmt = ShiftAmt->getOperand(0);
}
// Narrow/widen the shift amount to match the size of the shift operation.
if (VT == MVT::i32)
NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
MachineSDNode *Ext = CurDAG->getMachineNode(
AArch64::SUBREG_TO_REG, DL, VT,
CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
NewShiftAmt = SDValue(Ext, 0);
}
SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
bool
AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
unsigned RegWidth) {
APFloat FVal(0.0);
if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
FVal = CN->getValueAPF();
else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
// Some otherwise illegal constants are allowed in this case.
if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
!isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
return false;
ConstantPoolSDNode *CN =
dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
} else
return false;
// An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
// is between 1 and 32 for a destination w-register, or 1 and 64 for an
// x-register.
//
// By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
// want THIS_NODE to be 2^fbits. This is much easier to deal with using
// integers.
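// For example, a multiplier of 16.0 == 2^4 yields FBits = 4, the #fbits
// operand of the fixed-point FCVTZS/FCVTZU forms.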
bool IsExact;
// fbits is between 1 and 64 in the worst-case, which means the fmul
// could have 2^64 as an actual operand. Need 65 bits of precision.
APSInt IntVal(65, true);
FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
// N.b. isPowerOf2 also checks for > 0.
if (!IsExact || !IntVal.isPowerOf2()) return false;
unsigned FBits = IntVal.logBase2();
// Checks above should have guaranteed that we haven't lost information in
// finding FBits, but it must still be in range.
if (FBits == 0 || FBits > RegWidth) return false;
FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
return true;
}
// Inspects a register string of the form o0:op1:CRn:CRm:op2, splits it into
// its fields, obtains the integer values from them, and combines these into
// a single value to be used in the MRS/MSR instruction.
static int getIntOperandFromRegisterString(StringRef RegString) {
SmallVector<StringRef, 5> Fields;
RegString.split(Fields, ':');
if (Fields.size() == 1)
return -1;
assert(Fields.size() == 5
&& "Invalid number of fields in read register string");
SmallVector<int, 5> Ops;
bool AllIntFields = true;
for (StringRef Field : Fields) {
unsigned IntField;
AllIntFields &= !Field.getAsInteger(10, IntField);
Ops.push_back(IntField);
}
assert(AllIntFields &&
"Unexpected non-integer value in special register string.");
// Need to combine the integer fields of the string into a single value
// based on the bit encoding of MRS/MSR instruction.
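// For example, a (hypothetical) string "1:2:7:4:3" maps to
// (1 << 14) | (2 << 11) | (7 << 7) | (4 << 3) | 3.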
return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
(Ops[3] << 3) | (Ops[4]);
}
// Lower the read_register intrinsic to an MRS instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(0)));
return true;
}
// Use the sysreg mapper to map the remaining possible strings to the
// value for the register to be used for the instruction operand.
auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
if (TheReg && TheReg->Readable &&
TheReg->haveFeatures(Subtarget->getFeatureBits()))
Reg = TheReg->Encoding;
else
Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(0)));
return true;
}
if (RegString->getString() == "pc") {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(0, DL, MVT::i32),
N->getOperand(0)));
return true;
}
return false;
}
// Lower the write_register intrinsic to an MSR instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
if (Reg != -1) {
ReplaceNode(
N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(2), N->getOperand(0)));
return true;
}
// Check if the register was one of those allowed as the pstatefield value in
// the MSR (immediate) instruction. To accept the values allowed in the
// pstatefield for the MSR (immediate) instruction, we also require that an
// immediate value has been provided as an argument; we know that this is
// the case as it has been ensured by semantic checking.
auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
unsigned Reg = PMapper->Encoding;
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
unsigned State;
if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
assert(Immed < 2 && "Bad imm");
State = AArch64::MSRpstateImm1;
} else {
assert(Immed < 16 && "Bad imm");
State = AArch64::MSRpstateImm4;
}
ReplaceNode(N, CurDAG->getMachineNode(
State, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
CurDAG->getTargetConstant(Immed, DL, MVT::i16),
N->getOperand(0)));
return true;
}
// Use the sysreg mapper to attempt to map the remaining possible strings
// to the value for the register to be used for the MSR (register)
// instruction operand.
auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
if (TheReg && TheReg->Writeable &&
TheReg->haveFeatures(Subtarget->getFeatureBits()))
Reg = TheReg->Encoding;
else
Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(2), N->getOperand(0)));
return true;
}
return false;
}
/// We've got special pseudo-instructions for these compare-and-swap operations.
bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
// Leave IR for LSE if subtarget supports it.
if (Subtarget->hasLSE()) return false;
if (MemTy == MVT::i8)
Opcode = AArch64::CMP_SWAP_8;
else if (MemTy == MVT::i16)
Opcode = AArch64::CMP_SWAP_16;
else if (MemTy == MVT::i32)
Opcode = AArch64::CMP_SWAP_32;
else if (MemTy == MVT::i64)
Opcode = AArch64::CMP_SWAP_64;
else
llvm_unreachable("Unknown AtomicCmpSwap type");
MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
N->getOperand(0)};
SDNode *CmpSwap = CurDAG->getMachineNode(
Opcode, SDLoc(N),
CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
CurDAG->RemoveDeadNode(N);
return true;
}
bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
SDValue &Offset) {
auto C = dyn_cast<ConstantSDNode>(N);
if (!C)
return false;
auto Ty = N->getValueType(0);
int64_t Imm = C->getSExtValue();
SDLoc DL(N);
if ((Imm >= -128) && (Imm <= 127)) {
Base = CurDAG->getTargetConstant(Imm, DL, Ty);
Offset = CurDAG->getTargetConstant(0, DL, Ty);
return true;
}
if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
Offset = CurDAG->getTargetConstant(8, DL, Ty);
return true;
}
return false;
}
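// SVE ADD/SUB-style immediates are an unsigned 8-bit value with an optional
// left shift by 8 for element sizes of 16 bits or more; e.g. 0x4200 is
// accepted below as immediate 0x42 with shift 8.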
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
const int64_t ImmVal = CNode->getZExtValue();
SDLoc DL(N);
switch (VT.SimpleTy) {
case MVT::i8:
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
break;
case MVT::i16:
case MVT::i32:
case MVT::i64:
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
} else if ((ImmVal & 0xFF00) == ImmVal) {
Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
return true;
}
break;
default:
break;
}
}
return false;
}
bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
int64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
if (ImmVal >= -128 && ImmVal < 128) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
}
return false;
}
bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
ImmVal = ImmVal & 0xFF;
if (ImmVal < 256) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
}
return false;
}
bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CNode->getZExtValue();
SDLoc DL(N);
// Replicate the immediate across 64 bits according to the element size.
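// For example, an i16 immediate 0x00ff is replicated to 0x00ff00ff00ff00ff
// before being encoded as a 64-bit logical immediate.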
switch (VT.SimpleTy) {
case MVT::i8:
ImmVal &= 0xFF;
ImmVal |= ImmVal << 8;
ImmVal |= ImmVal << 16;
ImmVal |= ImmVal << 32;
break;
case MVT::i16:
ImmVal &= 0xFFFF;
ImmVal |= ImmVal << 16;
ImmVal |= ImmVal << 32;
break;
case MVT::i32:
ImmVal &= 0xFFFFFFFF;
ImmVal |= ImmVal << 32;
break;
case MVT::i64:
break;
default:
llvm_unreachable("Unexpected type");
}
uint64_t encoding;
if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
return true;
}
}
return false;
}
// This method is only needed to "cast" i64s into i32s when the value
// is a valid shift which has been splatted into a vector with i64 elements.
// Every other type is fine in tablegen.
bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
uint64_t High, SDValue &Imm) {
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
SDLoc DL(N);
if (ImmVal >= Low && ImmVal <= High) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
}
return false;
}
bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
// tagp(FrameIndex, IRGstack, tag_offset):
// since the offset between FrameIndex and IRGstack is a compile-time
// constant, this can be lowered to a single ADDG instruction.
if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
return false;
}
SDValue IRG_SP = N->getOperand(2);
if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
Intrinsic::aarch64_irg_sp) {
return false;
}
const TargetLowering *TLI = getTargetLowering();
SDLoc DL(N);
int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
SDValue FiOp = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
SDNode *Out = CurDAG->getMachineNode(
AArch64::TAGPstack, DL, MVT::i64,
{FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
ReplaceNode(N, Out);
return true;
}
void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
assert(isa<ConstantSDNode>(N->getOperand(3)) &&
"llvm.aarch64.tagp third argument must be an immediate");
if (trySelectStackSlotTagP(N))
return;
// FIXME: above applies in any case when offset between Op1 and Op2 is a
// compile-time constant, not just for stack allocations.
// General case for unrelated pointers in Op1 and Op2.
SDLoc DL(N);
int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
{N->getOperand(1), N->getOperand(2)});
SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
{SDValue(N1, 0), N->getOperand(2)});
SDNode *N3 = CurDAG->getMachineNode(
AArch64::ADDG, DL, MVT::i64,
{SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
ReplaceNode(N, N3);
}
// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
// vector types larger than NEON don't have a matching SubRegIndex.
static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
assert(V.getValueType().isScalableVector() &&
V.getValueType().getSizeInBits().getKnownMinSize() ==
AArch64::SVEBitsPerBlock &&
"Expected to extract from a packed scalable vector!");
assert(VT.isFixedLengthVector() &&
"Expected to extract a fixed length vector!");
SDLoc DL(V);
switch (VT.getSizeInBits()) {
case 64: {
auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
}
case 128: {
auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
}
default: {
auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
}
}
}
// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
// vector types larger than NEON don't have a matching SubRegIndex.
static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
assert(VT.isScalableVector() &&
VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
"Expected to insert into a packed scalable vector!");
assert(V.getValueType().isFixedLengthVector() &&
"Expected to insert a fixed length vector!");
SDLoc DL(V);
switch (V.getValueType().getSizeInBits()) {
case 64: {
auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
SDValue(Container, 0), V, SubReg);
}
case 128: {
auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
SDValue(Container, 0), V, SubReg);
}
default: {
auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
}
}
}
void AArch64DAGToDAGISel::Select(SDNode *Node) {
// If we have a machine node, it has already been selected!
if (Node->isMachineOpcode()) {
LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
// A few custom selection cases.
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
default:
break;
case ISD::ATOMIC_CMP_SWAP:
if (SelectCMP_SWAP(Node))
return;
break;
case ISD::READ_REGISTER:
if (tryReadRegister(Node))
return;
break;
case ISD::WRITE_REGISTER:
if (tryWriteRegister(Node))
return;
break;
case ISD::ADD:
if (tryMLAV64LaneV128(Node))
return;
break;
case ISD::LOAD: {
// Try to select as an indexed load. Fall through to normal processing
// if we can't.
if (tryIndexedLoad(Node))
return;
break;
}
case ISD::SRL:
case ISD::AND:
case ISD::SRA:
case ISD::SIGN_EXTEND_INREG:
if (tryBitfieldExtractOp(Node))
return;
if (tryBitfieldInsertInZeroOp(Node))
return;
LLVM_FALLTHROUGH;
case ISD::ROTR:
case ISD::SHL:
if (tryShiftAmountMod(Node))
return;
break;
case ISD::SIGN_EXTEND:
if (tryBitfieldExtractOpFromSExt(Node))
return;
break;
case ISD::FP_EXTEND:
if (tryHighFPExt(Node))
return;
break;
case ISD::OR:
if (tryBitfieldInsertOp(Node))
return;
break;
case ISD::EXTRACT_SUBVECTOR: {
// Bail when not a "cast" like extract_subvector.
if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
break;
// Bail when normal isel can do the job.
EVT InVT = Node->getOperand(0).getValueType();
if (VT.isScalableVector() || InVT.isFixedLengthVector())
break;
// NOTE: We can only get here when doing fixed length SVE code generation.
// We do manual selection because the types involved are not linked to real
// registers (despite being legal) and must be coerced into SVE registers.
//
// NOTE: If the above changes, be aware that selection will still not work
// because the td definition of extract_vector does not support extracting
// a fixed length vector from a scalable vector.
ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
return;
}
case ISD::INSERT_SUBVECTOR: {
// Bail when not a "cast" like insert_subvector.
if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
break;
if (!Node->getOperand(0).isUndef())
break;
// Bail when normal isel should do the job.
EVT InVT = Node->getOperand(1).getValueType();
if (VT.isFixedLengthVector() || InVT.isScalableVector())
break;
// NOTE: We can only get here when doing fixed length SVE code generation.
// We do manual selection because the types involved are not linked to real
// registers (despite being legal) and must be coerced into SVE registers.
//
// NOTE: If the above changes, be aware that selection will still not work
// because the td definition of insert_vector does not support inserting a
// fixed length vector into a scalable vector.
ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
return;
}
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
if (ConstNode->isNullValue()) {
if (VT == MVT::i32) {
SDValue New = CurDAG->getCopyFromReg(
CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
ReplaceNode(Node, New.getNode());
return;
} else if (VT == MVT::i64) {
SDValue New = CurDAG->getCopyFromReg(
CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
ReplaceNode(Node, New.getNode());
return;
}
}
break;
}
case ISD::FrameIndex: {
// Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
const TargetLowering *TLI = getTargetLowering();
SDValue TFI = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
SDLoc DL(Node);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp: {
unsigned Op =
IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
SDValue MemAddr = Node->getOperand(2);
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
MVT::Other, MemAddr, Chain);
// Transfer memoperands.
MachineMemOperand *MemOp =
cast<MemIntrinsicSDNode>(Node)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
ReplaceNode(Node, Ld);
return;
}
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp: {
unsigned Op =
IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
SDValue ValLo = Node->getOperand(2);
SDValue ValHi = Node->getOperand(3);
SDValue MemAddr = Node->getOperand(4);
// Place arguments in the right order.
SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp =
cast<MemIntrinsicSDNode>(Node)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(Node, St);
return;
}
case Intrinsic::aarch64_neon_ld1x2:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld1x3:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld1x4:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld2:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld3:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld4:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld2r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld3r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld4r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
return;
}
break;
case Intrinsic::aarch64_neon_ld2lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 2, AArch64::LD2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 2, AArch64::LD2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 2, AArch64::LD2i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 2, AArch64::LD2i64);
return;
}
break;
case Intrinsic::aarch64_neon_ld3lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 3, AArch64::LD3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 3, AArch64::LD3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 3, AArch64::LD3i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 3, AArch64::LD3i64);
return;
}
break;
case Intrinsic::aarch64_neon_ld4lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 4, AArch64::LD4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 4, AArch64::LD4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 4, AArch64::LD4i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 4, AArch64::LD4i64);
return;
}
break;
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_tagp:
SelectTagP(Node);
return;
case Intrinsic::aarch64_neon_tbl2:
SelectTable(Node, 2,
VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
false);
return;
case Intrinsic::aarch64_neon_tbl3:
SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
: AArch64::TBLv16i8Three,
false);
return;
case Intrinsic::aarch64_neon_tbl4:
SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
: AArch64::TBLv16i8Four,
false);
return;
case Intrinsic::aarch64_neon_tbx2:
SelectTable(Node, 2,
VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
true);
return;
case Intrinsic::aarch64_neon_tbx3:
SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
: AArch64::TBXv16i8Three,
true);
return;
case Intrinsic::aarch64_neon_tbx4:
SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
: AArch64::TBXv16i8Four,
true);
return;
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
if (tryMULLV64LaneV128(IntNo, Node))
return;
break;
}
break;
}
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
if (Node->getNumOperands() >= 3)
VT = Node->getOperand(2)->getValueType(0);
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_neon_st1x2: {
if (VT == MVT::v8i8) {
SelectStore(Node, 2, AArch64::ST1Twov8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST1Twov16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST1Twov4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST1Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 2, AArch64::ST1Twov2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 2, AArch64::ST1Twov4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 2, AArch64::ST1Twov2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 2, AArch64::ST1Twov1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st1x3: {
if (VT == MVT::v8i8) {
SelectStore(Node, 3, AArch64::ST1Threev8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST1Threev16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST1Threev4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST1Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 3, AArch64::ST1Threev2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 3, AArch64::ST1Threev4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 3, AArch64::ST1Threev2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 3, AArch64::ST1Threev1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st1x4: {
if (VT == MVT::v8i8) {
SelectStore(Node, 4, AArch64::ST1Fourv8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST1Fourv16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 4, AArch64::ST1Fourv2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 4, AArch64::ST1Fourv4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 4, AArch64::ST1Fourv2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 4, AArch64::ST1Fourv1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st2: {
if (VT == MVT::v8i8) {
SelectStore(Node, 2, AArch64::ST2Twov8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST2Twov16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST2Twov4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST2Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 2, AArch64::ST2Twov2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 2, AArch64::ST2Twov4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 2, AArch64::ST2Twov2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 2, AArch64::ST1Twov1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st3: {
if (VT == MVT::v8i8) {
SelectStore(Node, 3, AArch64::ST3Threev8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST3Threev16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST3Threev4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST3Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 3, AArch64::ST3Threev2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 3, AArch64::ST3Threev4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 3, AArch64::ST3Threev2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 3, AArch64::ST1Threev1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st4: {
if (VT == MVT::v8i8) {
SelectStore(Node, 4, AArch64::ST4Fourv8b);
return;
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST4Fourv16b);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv4h);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 4, AArch64::ST4Fourv2s);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 4, AArch64::ST4Fourv4s);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 4, AArch64::ST4Fourv2d);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 4, AArch64::ST1Fourv1d);
return;
}
break;
}
case Intrinsic::aarch64_neon_st2lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 2, AArch64::ST2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 2, AArch64::ST2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 2, AArch64::ST2i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 2, AArch64::ST2i64);
return;
}
break;
}
case Intrinsic::aarch64_neon_st3lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 3, AArch64::ST3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 3, AArch64::ST3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 3, AArch64::ST3i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 3, AArch64::ST3i64);
return;
}
break;
}
case Intrinsic::aarch64_neon_st4lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 4, AArch64::ST4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 4, AArch64::ST4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 4, AArch64::ST4i32);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 4, AArch64::ST4i64);
return;
}
break;
}
case Intrinsic::aarch64_sve_st2: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B,
- AArch64::ST2B_IMM);
+ SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H,
- AArch64::ST2H_IMM);
+ SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W,
- AArch64::ST2W_IMM);
+ SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D,
- AArch64::ST2D_IMM);
+ SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);
return;
}
break;
}
case Intrinsic::aarch64_sve_st3: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B,
- AArch64::ST3B_IMM);
+ SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H,
- AArch64::ST3H_IMM);
+ SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W,
- AArch64::ST3W_IMM);
+ SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D,
- AArch64::ST3D_IMM);
+ SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);
return;
}
break;
}
case Intrinsic::aarch64_sve_st4: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B,
- AArch64::ST4B_IMM);
+ SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H,
- AArch64::ST4H_IMM);
+ SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W,
- AArch64::ST4W_IMM);
+ SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D,
- AArch64::ST4D_IMM);
+ SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);
return;
}
break;
}
}
break;
}
case AArch64ISD::LD2post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD3post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD4post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD1x2post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD1x3post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD1x4post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD1DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD2DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD3DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD4DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
return;
}
break;
}
case AArch64ISD::LD1LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
return;
}
break;
}
case AArch64ISD::LD2LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
return;
}
break;
}
case AArch64ISD::LD3LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
return;
}
break;
}
case AArch64ISD::LD4LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
return;
}
break;
}
case AArch64ISD::ST2post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
return;
}
break;
}
case AArch64ISD::ST3post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
return;
}
break;
}
case AArch64ISD::ST4post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
return;
}
break;
}
case AArch64ISD::ST1x2post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
return;
}
break;
}
case AArch64ISD::ST1x3post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
return;
}
break;
}
case AArch64ISD::ST1x4post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
return;
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
return;
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
return;
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
return;
}
break;
}
case AArch64ISD::ST2LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
return;
}
break;
}
case AArch64ISD::ST3LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
return;
}
break;
}
case AArch64ISD::ST4LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
return;
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
return;
}
break;
}
case AArch64ISD::SVE_LD2_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+ SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+ SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+ SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+ SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D);
return;
}
break;
}
case AArch64ISD::SVE_LD3_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+ SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+ SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+ SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+ SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D);
return;
}
break;
}
case AArch64ISD::SVE_LD4_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+ SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+ SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+ SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W);
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+ SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D);
return;
}
break;
}
}
// Select the default instruction
SelectCode(Node);
}
/// createAArch64ISelDag - This pass converts a legalized DAG into an
/// AArch64-specific DAG, ready for instruction scheduling.
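/// This is typically scheduled from the target's pass configuration, roughly:
/// addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()))
/// inside AArch64PassConfig::addInstSelector().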
FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
}
/// When \p PredVT is a scalable vector predicate in the form
/// MVT::nx<M>xi1, it builds the corresponding scalable vector of
-/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input
+/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting
+/// structured vectors (NumVec > 1), the output data type is
+/// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input
/// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
/// EVT.
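/// For example, PredVT == MVT::nxv4i1 maps to 128/4 = 32-bit integer lanes,
/// giving MVT::nxv4i32 for NumVec == 1 and MVT::nxv8i32 for NumVec == 2.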
-static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) {
+static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
+ unsigned NumVec) {
+ assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors.");
if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
return EVT();
if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
return EVT();
ElementCount EC = PredVT.getVectorElementCount();
EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
- EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC);
+ EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec);
+
return MemVT;
}
/// Return the EVT of the data associated with a memory operation in \p
/// Root. If such an EVT cannot be retrieved, it returns an invalid EVT.
static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
if (isa<MemSDNode>(Root))
return cast<MemSDNode>(Root)->getMemoryVT();
if (isa<MemIntrinsicSDNode>(Root))
return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
const unsigned Opcode = Root->getOpcode();
// For custom ISD nodes, we have to look at them individually to extract the
// type of the data moved to/from memory.
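// For example, AArch64ISD::LD1_MERGE_ZERO carries its memory type as a
// VTSDNode operand, whereas the structured SVE loads below derive it from
// the width of their governing predicate.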
switch (Opcode) {
case AArch64ISD::LD1_MERGE_ZERO:
case AArch64ISD::LD1S_MERGE_ZERO:
case AArch64ISD::LDNF1_MERGE_ZERO:
case AArch64ISD::LDNF1S_MERGE_ZERO:
return cast<VTSDNode>(Root->getOperand(3))->getVT();
case AArch64ISD::ST1_PRED:
return cast<VTSDNode>(Root->getOperand(4))->getVT();
+ case AArch64ISD::SVE_LD2_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2);
+ case AArch64ISD::SVE_LD3_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3);
+ case AArch64ISD::SVE_LD4_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4);
default:
break;
}
if (Opcode != ISD::INTRINSIC_VOID)
return EVT();
const unsigned IntNo =
cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
if (IntNo != Intrinsic::aarch64_sve_prf)
return EVT();
// We are using an SVE prefetch intrinsic. Type must be inferred
// from the width of the predicate.
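// For example, a prefetch governed by an nxv4i1 predicate is treated as
// touching nxv4i32 data: one 128-bit block split into four 32-bit lanes.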
return getPackedVectorTypeFromPredicateType(
- Ctx, Root->getOperand(2)->getValueType(0));
+ Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);
}
/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
/// where Root is the memory access using N for its address.
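/// For example, with MemVT == nxv4i32 (16 bytes per vscale increment), an
/// address of the form (add Base, (vscale * 48)) folds to OffImm == 3,
/// provided 3 lies within [Min, Max] for the instruction being matched.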
template <int64_t Min, int64_t Max>
bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
SDValue &Base,
SDValue &OffImm) {
const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
if (MemVT == EVT())
return false;
if (N.getOpcode() != ISD::ADD)
return false;
SDValue VScale = N.getOperand(1);
if (VScale.getOpcode() != ISD::VSCALE)
return false;
TypeSize TS = MemVT.getSizeInBits();
int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
if ((MulImm % MemWidthBytes) != 0)
return false;
int64_t Offset = MulImm / MemWidthBytes;
if (Offset < Min || Offset > Max)
return false;
Base = N.getOperand(0);
OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
return true;
}
/// Select register plus register addressing mode for SVE, with scaled
/// offset.
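/// For example, with Scale == 2 (word-sized elements) this matches
/// (add Base, (shl Idx, 2)), returning Base and Idx so that the instruction
/// itself applies the "lsl #2" to the index register.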
bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
SDValue &Base,
SDValue &Offset) {
if (N.getOpcode() != ISD::ADD)
return false;
// Process an ADD node.
const SDValue LHS = N.getOperand(0);
const SDValue RHS = N.getOperand(1);
// 8-bit data does not come with a SHL node, so it is treated
// separately.
if (Scale == 0) {
Base = LHS;
Offset = RHS;
return true;
}
// Check if the RHS is a shift node with a constant.
if (RHS.getOpcode() != ISD::SHL)
return false;
const SDValue ShiftRHS = RHS.getOperand(1);
if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
if (C->getZExtValue() == Scale) {
Base = LHS;
Offset = RHS.getOperand(0);
return true;
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 85db14ab66fe..1500da2fdfc7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,15156 +1,15177 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
"aarch64-elf-ldtls-generation", cl::Hidden,
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
cl::desc("Enable AArch64 logical imm instruction "
"optimization"),
cl::init(true));
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal vector type!");
return VT.isFixedLengthVector() ||
VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
setBooleanContents(ZeroOrOneBooleanContent);
// When comparing vectors, each element of the result is set to all-ones or
// all-zeros.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
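// For example, a v4i32 vector compare produces 0 or 0xFFFFFFFF in each lane.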
// Set up the register classes.
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
// Someone set us up the NEON.
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
addDRTypeForNEON(MVT::v4i16);
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
addDRTypeForNEON(MVT::v4bf16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
addQRTypeForNEON(MVT::v16i8);
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
addQRTypeForNEON(MVT::v8bf16);
}
if (Subtarget->hasSVE()) {
// Add legal SVE predicate types
addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
// Add legal SVE data types
addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
if (Subtarget->hasBF16()) {
addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
}
if (useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
}
for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
for (auto VT :
{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
for (auto VT :
{ MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
MVT::nxv2f64 }) {
setCondCodeAction(ISD::SETO, VT, Expand);
setCondCodeAction(ISD::SETOLT, VT, Expand);
setCondCodeAction(ISD::SETOLE, VT, Expand);
setCondCodeAction(ISD::SETULT, VT, Expand);
setCondCodeAction(ISD::SETULE, VT, Expand);
setCondCodeAction(ISD::SETUGE, VT, Expand);
setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);
}
}
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operations on f128 are legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FRINT, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
// Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
// AArch64 doesn't have i32 MULH{S|U}.
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
// Custom lower Add/Sub/Mul with overflow.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
if (Subtarget->hasFullFP16())
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
else
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
if (!Subtarget->hasFullFP16()) {
setOperationAction(ISD::SELECT, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
// Promote v4f16 to v4f32 when that is known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
setOperationAction(ISD::FABS, MVT::v8f16, Expand);
setOperationAction(ISD::FADD, MVT::v8f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
setOperationAction(ISD::FMA, MVT::v8f16, Expand);
setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
setOperationAction(ISD::FMAXIMUM, Ty, Legal);
setOperationAction(ISD::LROUND, Ty, Legal);
setOperationAction(ISD::LLROUND, Ty, Legal);
setOperationAction(ISD::LRINT, Ty, Legal);
setOperationAction(ISD::LLRINT, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
setOperationAction(ISD::FCEIL, MVT::f16, Legal);
setOperationAction(ISD::FRINT, MVT::f16, Legal);
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// 128-bit loads and stores can be done without expanding
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
// custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
setOperationAction(ISD::STORE, MVT::v32i8, Custom);
setOperationAction(ISD::STORE, MVT::v16i16, Custom);
setOperationAction(ISD::STORE, MVT::v16f16, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v8f32, Custom);
setOperationAction(ISD::STORE, MVT::v4f64, Custom);
setOperationAction(ISD::STORE, MVT::v4i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
// Issue __sincos_stret if available.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
if (Subtarget->getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
}
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
}
// AArch64 does not have floating-point extending loads, i1 sign-extending
// loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
}
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedLoadAction(im, MVT::bf16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::bf16, Legal);
}
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
// Try to create BICs for vector ANDs.
setTargetDAGCombine(ISD::AND);
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::GlobalAddress);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset = Subtarget->requiresStrictAlign()
? MaxStoresPerMemsetOptSize : 32;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
? MaxStoresPerMemcpyOptSize : 16;
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
MaxLoadsPerMemcmpOptSize = 4;
MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
? MaxLoadsPerMemcmpOptSize : 8;
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
EnableExtLdPromotion = true;
// Set required alignment.
setMinFunctionAlignment(Align(4));
// Set preferred alignments.
setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
// Only change the limit for entries in a jump table if specified by
// the subtarget, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
setMaximumJumpTableSize(MaxJT);
setHasExtractBitsInsn(true);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
setOperationAction(ISD::FABS, MVT::v1f64, Expand);
setOperationAction(ISD::FADD, MVT::v1f64, Expand);
setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
setOperationAction(ISD::FMA, MVT::v1f64, Expand);
setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
// i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Likewise, there is no direct i32 -> f16 vector conversion; set it to Custom so the
// conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
} else {
// When AArch64 doesn't have full fp16 support, promote the input
// to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
}
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
// Vector reductions
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
// Saturates
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::TRUNCATE, VT, Custom);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
}
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
setOperationAction(ISD::MULHS, VT, Legal);
setOperationAction(ISD::MULHU, VT, Legal);
} else {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
}
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
if (Subtarget->hasSVE())
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
if (Subtarget->hasSVE()) {
// FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
// splat of 0 or undef) once vector selects are supported in SVE codegen. See
// D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
if (isTypeLegal(VT)) {
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::UMIN, VT, Custom);
setOperationAction(ISD::SMAX, VT, Custom);
setOperationAction(ISD::UMAX, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
- if (VT.getScalarType() == MVT::i1)
+ if (VT.getScalarType() == MVT::i1) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+ }
}
}
for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
if (isTypeLegal(VT)) {
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
}
}
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);
// 64-bit results can mean a bigger-than-NEON input.
for (auto VT : {MVT::v8i8, MVT::v4i16})
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
// 128-bit results imply a bigger-than-NEON input.
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(ISD::TRUNCATE, VT, Custom);
for (auto VT : {MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::FP_ROUND, VT, Expand);
}
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
assert(VT.isVector() && "VT should be a vector type");
if (VT.isFloatingPoint()) {
MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes; use UADDLP afterwards to widen.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
if (!VT.isFloatingPoint())
setOperationAction(ISD::ABS, VT, Legal);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
if (VT.isFloatingPoint() &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
}
}
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
// By default everything must be expanded.
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
setOperationAction(Op, VT, Expand);
// We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
}
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
}
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR128RegClass);
addTypeForNEON(VT, MVT::v4i32);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
LLVMContext &C, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
if (VT.isScalableVector())
return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
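// Editorial note, not part of the original source: as an illustration of the
// rules above, a setcc on scalar MVT::i64 uses MVT::i32 as its result type, a
// setcc on the scalable type nxv4f32 uses nxv4i1 (one i1 per lane), and a
// setcc on the fixed-length type v4f32 uses v4i32 (same-width integer lanes).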
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
const APInt &Demanded,
TargetLowering::TargetLoweringOpt &TLO,
unsigned NewOpc) {
uint64_t OldImm = Imm, NewImm, Enc;
uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
// Return if the immediate is already all zeros, all ones, a bimm32 or a
// bimm64.
if (Imm == 0 || Imm == Mask ||
AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
return false;
unsigned EltSize = Size;
uint64_t DemandedBits = Demanded.getZExtValue();
// Clear bits that are not demanded.
Imm &= DemandedBits;
while (true) {
// The goal here is to set the non-demanded bits in a way that minimizes
// the number of transitions between 0 and 1. In order to achieve this goal,
// we set the non-demanded bits to the value of the preceding demanded bits.
// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
// non-demanded bit), we copy bit0 (1) to the least significant 'x',
// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
// The final result is 0b11000011.
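// Editorial worked example, not part of the original source, assuming an
// 8-bit element purely for exposition (real element sizes start at 32/64):
//   Imm          = 0b01000001 (demanded bits of 0bx10xx0x1, non-demanded cleared)
//   DemandedBits = 0b01100101, NonDemandedBits = 0b10011010
//   InvertedImm  = 0b00100100, RotatedImm      = 0b00001000
//   Sum          = 0b10100010, Carry = 0,  Ones = 0b10000010
//   NewImm       = (Imm | Ones) & Mask     = 0b11000011, matching the result above.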
uint64_t NonDemandedBits = ~DemandedBits;
uint64_t InvertedImm = ~Imm & DemandedBits;
uint64_t RotatedImm =
((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
NonDemandedBits;
uint64_t Sum = RotatedImm + NonDemandedBits;
bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
uint64_t Ones = (Sum + Carry) & NonDemandedBits;
NewImm = (Imm | Ones) & Mask;
// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
// we halve the element size and continue the search.
if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
break;
// We cannot shrink the element size any further if it is 2-bits.
if (EltSize == 2)
return false;
EltSize /= 2;
Mask >>= EltSize;
uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
// Return if there is a mismatch in any of the demanded bits of Imm and Hi.
if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
return false;
// Merge the upper and lower halves of Imm and DemandedBits.
Imm |= Hi;
DemandedBits |= DemandedBitsHi;
}
++NumOptimizedImms;
// Replicate the element across the register width.
while (EltSize < Size) {
NewImm |= NewImm << EltSize;
EltSize *= 2;
}
(void)OldImm;
assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
"demanded bits should never be altered");
assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
// Create the new constant immediate node.
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue New;
// If the new constant immediate is all-zeros or all-ones, let the target
// independent DAG combine optimize this node.
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
// Otherwise, create a machine node so that target independent DAG combine
// doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
New = SDValue(
TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
}
return TLO.CombineTo(Op, New);
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
if (!EnableOptimizeLogicalImm)
return false;
EVT VT = Op.getValueType();
if (VT.isVector())
return false;
unsigned Size = VT.getSizeInBits();
assert((Size == 32 || Size == 64) &&
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
if (DemandedBits.countPopulation() == Size)
return false;
unsigned NewOpc;
switch (Op.getOpcode()) {
default:
return false;
case ISD::AND:
NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
break;
case ISD::OR:
NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
break;
case ISD::XOR:
NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
break;
}
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
break;
}
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
break;
// In ILP32 mode all valid pointers are in the low 4GB of the address-space.
Known.Zero = APInt::getHighBitsSet(64, 32);
break;
}
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
break;
}
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larger doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
Known.Zero |= Mask;
}
break;
} break;
}
}
}
}
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
EVT) const {
return MVT::i64;
}
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Align <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
VT == MVT::v2i64;
}
return true;
}
// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() ||
Ty.getSizeInBytes() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
Ty == LLT::vector(2, 64);
}
return true;
}
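// Editorial note, not part of the original source: the "underspecified
// alignment" case mentioned above corresponds to source-level types such as
//   typedef int32_t v4i32_u __attribute__((vector_size(16), aligned(1)));
// (a hypothetical typedef); loads and stores of such a type carry alignment
// 1 or 2 and are still reported as fast here.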
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return AArch64::createFastISel(funcInfo, libInfo);
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V) \
case V: \
return #V;
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER:
break;
MAKE_CASE(AArch64ISD::CALL)
MAKE_CASE(AArch64ISD::ADRP)
MAKE_CASE(AArch64ISD::ADR)
MAKE_CASE(AArch64ISD::ADDlow)
MAKE_CASE(AArch64ISD::LOADgot)
MAKE_CASE(AArch64ISD::RET_FLAG)
MAKE_CASE(AArch64ISD::BRCOND)
MAKE_CASE(AArch64ISD::CSEL)
MAKE_CASE(AArch64ISD::FCSEL)
MAKE_CASE(AArch64ISD::CSINV)
MAKE_CASE(AArch64ISD::CSNEG)
MAKE_CASE(AArch64ISD::CSINC)
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::ADD_PRED)
MAKE_CASE(AArch64ISD::SDIV_PRED)
MAKE_CASE(AArch64ISD::UDIV_PRED)
MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
MAKE_CASE(AArch64ISD::ADC)
MAKE_CASE(AArch64ISD::SBC)
MAKE_CASE(AArch64ISD::ADDS)
MAKE_CASE(AArch64ISD::SUBS)
MAKE_CASE(AArch64ISD::ADCS)
MAKE_CASE(AArch64ISD::SBCS)
MAKE_CASE(AArch64ISD::ANDS)
MAKE_CASE(AArch64ISD::CCMP)
MAKE_CASE(AArch64ISD::CCMN)
MAKE_CASE(AArch64ISD::FCCMP)
MAKE_CASE(AArch64ISD::FCMP)
MAKE_CASE(AArch64ISD::STRICT_FCMP)
MAKE_CASE(AArch64ISD::STRICT_FCMPE)
MAKE_CASE(AArch64ISD::DUP)
MAKE_CASE(AArch64ISD::DUPLANE8)
MAKE_CASE(AArch64ISD::DUPLANE16)
MAKE_CASE(AArch64ISD::DUPLANE32)
MAKE_CASE(AArch64ISD::DUPLANE64)
MAKE_CASE(AArch64ISD::MOVI)
MAKE_CASE(AArch64ISD::MOVIshift)
MAKE_CASE(AArch64ISD::MOVIedit)
MAKE_CASE(AArch64ISD::MOVImsl)
MAKE_CASE(AArch64ISD::FMOV)
MAKE_CASE(AArch64ISD::MVNIshift)
MAKE_CASE(AArch64ISD::MVNImsl)
MAKE_CASE(AArch64ISD::BICi)
MAKE_CASE(AArch64ISD::ORRi)
MAKE_CASE(AArch64ISD::BSP)
MAKE_CASE(AArch64ISD::NEG)
MAKE_CASE(AArch64ISD::EXTR)
MAKE_CASE(AArch64ISD::ZIP1)
MAKE_CASE(AArch64ISD::ZIP2)
MAKE_CASE(AArch64ISD::UZP1)
MAKE_CASE(AArch64ISD::UZP2)
MAKE_CASE(AArch64ISD::TRN1)
MAKE_CASE(AArch64ISD::TRN2)
MAKE_CASE(AArch64ISD::REV16)
MAKE_CASE(AArch64ISD::REV32)
MAKE_CASE(AArch64ISD::REV64)
MAKE_CASE(AArch64ISD::EXT)
MAKE_CASE(AArch64ISD::VSHL)
MAKE_CASE(AArch64ISD::VLSHR)
MAKE_CASE(AArch64ISD::VASHR)
MAKE_CASE(AArch64ISD::VSLI)
MAKE_CASE(AArch64ISD::VSRI)
MAKE_CASE(AArch64ISD::CMEQ)
MAKE_CASE(AArch64ISD::CMGE)
MAKE_CASE(AArch64ISD::CMGT)
MAKE_CASE(AArch64ISD::CMHI)
MAKE_CASE(AArch64ISD::CMHS)
MAKE_CASE(AArch64ISD::FCMEQ)
MAKE_CASE(AArch64ISD::FCMGE)
MAKE_CASE(AArch64ISD::FCMGT)
MAKE_CASE(AArch64ISD::CMEQz)
MAKE_CASE(AArch64ISD::CMGEz)
MAKE_CASE(AArch64ISD::CMGTz)
MAKE_CASE(AArch64ISD::CMLEz)
MAKE_CASE(AArch64ISD::CMLTz)
MAKE_CASE(AArch64ISD::FCMEQz)
MAKE_CASE(AArch64ISD::FCMGEz)
MAKE_CASE(AArch64ISD::FCMGTz)
MAKE_CASE(AArch64ISD::FCMLEz)
MAKE_CASE(AArch64ISD::FCMLTz)
MAKE_CASE(AArch64ISD::SADDV)
MAKE_CASE(AArch64ISD::UADDV)
MAKE_CASE(AArch64ISD::SRHADD)
MAKE_CASE(AArch64ISD::URHADD)
MAKE_CASE(AArch64ISD::SMINV)
MAKE_CASE(AArch64ISD::UMINV)
MAKE_CASE(AArch64ISD::SMAXV)
MAKE_CASE(AArch64ISD::UMAXV)
MAKE_CASE(AArch64ISD::SMAXV_PRED)
MAKE_CASE(AArch64ISD::UMAXV_PRED)
MAKE_CASE(AArch64ISD::SMINV_PRED)
MAKE_CASE(AArch64ISD::UMINV_PRED)
MAKE_CASE(AArch64ISD::ORV_PRED)
MAKE_CASE(AArch64ISD::EORV_PRED)
MAKE_CASE(AArch64ISD::ANDV_PRED)
MAKE_CASE(AArch64ISD::CLASTA_N)
MAKE_CASE(AArch64ISD::CLASTB_N)
MAKE_CASE(AArch64ISD::LASTA)
MAKE_CASE(AArch64ISD::LASTB)
MAKE_CASE(AArch64ISD::REV)
MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
MAKE_CASE(AArch64ISD::TBL)
MAKE_CASE(AArch64ISD::FADD_PRED)
MAKE_CASE(AArch64ISD::FADDA_PRED)
MAKE_CASE(AArch64ISD::FADDV_PRED)
MAKE_CASE(AArch64ISD::FMA_PRED)
MAKE_CASE(AArch64ISD::FMAXV_PRED)
MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
MAKE_CASE(AArch64ISD::FMINV_PRED)
MAKE_CASE(AArch64ISD::FMINNMV_PRED)
MAKE_CASE(AArch64ISD::NOT)
MAKE_CASE(AArch64ISD::BIT)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
MAKE_CASE(AArch64ISD::TBZ)
MAKE_CASE(AArch64ISD::TBNZ)
MAKE_CASE(AArch64ISD::TC_RETURN)
MAKE_CASE(AArch64ISD::PREFETCH)
MAKE_CASE(AArch64ISD::SITOF)
MAKE_CASE(AArch64ISD::UITOF)
MAKE_CASE(AArch64ISD::NVCAST)
MAKE_CASE(AArch64ISD::SQSHL_I)
MAKE_CASE(AArch64ISD::UQSHL_I)
MAKE_CASE(AArch64ISD::SRSHR_I)
MAKE_CASE(AArch64ISD::URSHR_I)
MAKE_CASE(AArch64ISD::SQSHLU_I)
MAKE_CASE(AArch64ISD::WrapperLarge)
MAKE_CASE(AArch64ISD::LD2post)
MAKE_CASE(AArch64ISD::LD3post)
MAKE_CASE(AArch64ISD::LD4post)
MAKE_CASE(AArch64ISD::ST2post)
MAKE_CASE(AArch64ISD::ST3post)
MAKE_CASE(AArch64ISD::ST4post)
MAKE_CASE(AArch64ISD::LD1x2post)
MAKE_CASE(AArch64ISD::LD1x3post)
MAKE_CASE(AArch64ISD::LD1x4post)
MAKE_CASE(AArch64ISD::ST1x2post)
MAKE_CASE(AArch64ISD::ST1x3post)
MAKE_CASE(AArch64ISD::ST1x4post)
MAKE_CASE(AArch64ISD::LD1DUPpost)
MAKE_CASE(AArch64ISD::LD2DUPpost)
MAKE_CASE(AArch64ISD::LD3DUPpost)
MAKE_CASE(AArch64ISD::LD4DUPpost)
MAKE_CASE(AArch64ISD::LD1LANEpost)
MAKE_CASE(AArch64ISD::LD2LANEpost)
MAKE_CASE(AArch64ISD::LD3LANEpost)
MAKE_CASE(AArch64ISD::LD4LANEpost)
MAKE_CASE(AArch64ISD::ST2LANEpost)
MAKE_CASE(AArch64ISD::ST3LANEpost)
MAKE_CASE(AArch64ISD::ST4LANEpost)
MAKE_CASE(AArch64ISD::SMULL)
MAKE_CASE(AArch64ISD::UMULL)
MAKE_CASE(AArch64ISD::FRECPE)
MAKE_CASE(AArch64ISD::FRECPS)
MAKE_CASE(AArch64ISD::FRSQRTE)
MAKE_CASE(AArch64ISD::FRSQRTS)
MAKE_CASE(AArch64ISD::STG)
MAKE_CASE(AArch64ISD::STZG)
MAKE_CASE(AArch64ISD::ST2G)
MAKE_CASE(AArch64ISD::STZ2G)
MAKE_CASE(AArch64ISD::SUNPKHI)
MAKE_CASE(AArch64ISD::SUNPKLO)
MAKE_CASE(AArch64ISD::UUNPKHI)
MAKE_CASE(AArch64ISD::UUNPKLO)
MAKE_CASE(AArch64ISD::INSR)
MAKE_CASE(AArch64ISD::PTEST)
MAKE_CASE(AArch64ISD::PTRUE)
MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
MAKE_CASE(AArch64ISD::SSTNT1_PRED)
MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
MAKE_CASE(AArch64ISD::LDP)
MAKE_CASE(AArch64ISD::STP)
MAKE_CASE(AArch64ISD::STNP)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
}
#undef MAKE_CASE
return nullptr;
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
// OrigBB:
// [... previous instrs leading to comparison ...]
// b.ne TrueBB
// b EndBB
// TrueBB:
// ; Fallthrough
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
Register DestReg = MI.getOperand(0).getReg();
Register IfTrueReg = MI.getOperand(1).getReg();
Register IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrueBB);
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
// TrueBB falls through to the end.
TrueBB->addSuccessor(EndBB);
if (!NZCVKilled) {
TrueBB->addLiveIn(AArch64::NZCV);
EndBB->addLiveIn(AArch64::NZCV);
}
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
.addReg(IfTrueReg)
.addMBB(TrueBB)
.addReg(IfFalseReg)
.addMBB(MBB);
MI.eraseFromParent();
return EndBB;
}
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
MachineInstr &MI, MachineBasicBlock *BB) const {
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
return BB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
}
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unknown condition code!");
case ISD::SETNE:
return AArch64CC::NE;
case ISD::SETEQ:
return AArch64CC::EQ;
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
return AArch64CC::GE;
case ISD::SETLT:
return AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
case ISD::SETUGT:
return AArch64CC::HI;
case ISD::SETUGE:
return AArch64CC::HS;
case ISD::SETULT:
return AArch64CC::LO;
case ISD::SETULE:
return AArch64CC::LS;
}
}
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
case ISD::SETOEQ:
CondCode = AArch64CC::EQ;
break;
case ISD::SETGT:
case ISD::SETOGT:
CondCode = AArch64CC::GT;
break;
case ISD::SETGE:
case ISD::SETOGE:
CondCode = AArch64CC::GE;
break;
case ISD::SETOLT:
CondCode = AArch64CC::MI;
break;
case ISD::SETOLE:
CondCode = AArch64CC::LS;
break;
case ISD::SETONE:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GT;
break;
case ISD::SETO:
CondCode = AArch64CC::VC;
break;
case ISD::SETUO:
CondCode = AArch64CC::VS;
break;
case ISD::SETUEQ:
CondCode = AArch64CC::EQ;
CondCode2 = AArch64CC::VS;
break;
case ISD::SETUGT:
CondCode = AArch64CC::HI;
break;
case ISD::SETUGE:
CondCode = AArch64CC::PL;
break;
case ISD::SETLT:
case ISD::SETULT:
CondCode = AArch64CC::LT;
break;
case ISD::SETLE:
case ISD::SETULE:
CondCode = AArch64CC::LE;
break;
case ISD::SETNE:
case ISD::SETUNE:
CondCode = AArch64CC::NE;
break;
}
}
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
assert(CondCode2 == AArch64CC::AL);
break;
case ISD::SETONE:
// (a one b)
// == ((a olt b) || (a ogt b))
// == ((a ord b) && (a une b))
CondCode = AArch64CC::VC;
CondCode2 = AArch64CC::NE;
break;
case ISD::SETUEQ:
// (a ueq b)
// == ((a uno b) || (a oeq b))
// == ((a ule b) && (a uge b))
CondCode = AArch64CC::PL;
CondCode2 = AArch64CC::LE;
break;
}
}
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2,
bool &Invert) {
Invert = false;
switch (CC) {
default:
// Mostly the scalar mappings work fine.
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
break;
case ISD::SETUO:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETO:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GE;
break;
case ISD::SETUEQ:
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETUGT:
case ISD::SETUGE:
// All of the compare-mask comparisons are ordered, but we can switch
// between the two by a double inversion. E.g. ULE == !OGT.
Invert = true;
changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
CondCode, CondCode2);
break;
}
}
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
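// For illustration, a few sample immediates and what the check below makes of
// them:
//   0xFFF    -> legal   (fits in the low 12 bits)
//   0xFFF000 -> legal   (low 12 bits clear, fits in 24 bits)
//   0x1001   -> illegal (matches neither form)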
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
LLVM_DEBUG(dbgs() << "Is imm " << C
<< " legal: " << (IsLegal ? "yes\n" : "no\n"));
return IsLegal;
}
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal,
// everything is fine; if not, the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
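// For example, with CC == SETEQ a caller may rewrite
//   (CMP op1, (sub 0, op2))  -->  (CMN op1, op2)
// because an EQ/NE test only consumes the Z flag; see emitComparison() below.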
static bool isCMN(SDValue Op, ISD::CondCode CC) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE);
}
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
SelectionDAG &DAG, SDValue Chain,
bool IsSignaling) {
EVT VT = LHS.getValueType();
assert(VT != MVT::f128);
assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
if (VT == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
VT = MVT::f32;
}
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
// A later phase can perform the optimization of setting the destination
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
if (isCMN(RHS, CC)) {
// Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
} else if (isCMN(LHS, CC)) {
// As we are looking for EQ/NE compares, the operands can be commuted; can
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
} else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
if (LHS.getOpcode() == ISD::AND) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
DAG.getVTList(VT, MVT_CC),
LHS.getOperand(0),
LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
return ANDSNode.getValue(1);
} else if (LHS.getOpcode() == AArch64ISD::ANDS) {
// Use result of ANDS
return LHS.getValue(1);
}
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This makes it possible to express arbitrary
/// conjunctions, for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
/// cmp A
/// ccmp B, inv(CB), CA
/// check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
/// - We can implement (NEG SETCC), i.e. negating a single comparison, by
/// negating the flags used in a CCMP/FCCMP operation.
/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
/// by negating the flags we test for afterwards, i.e.
/// NEG (CMP CCMP CCMP ...) can be implemented.
/// - Note that we can only ever negate all previously processed results.
/// What we cannot implement by flipping the flags to test is a negation
/// of two sub-trees (because the negation affects all sub-trees emitted so
/// far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
/// - (OR (SETCC A) (SETCC B)) can be implemented via:
/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
/// - After transforming OR to NEG/AND combinations we may be able to use NEG
/// elimination rules from earlier to implement the whole thing as a
/// CCMP/FCCMP chain.
///
/// As complete example:
/// or (or (setCA (cmp A)) (setCB (cmp B)))
/// (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
/// or (and (setCC (cmp C)) (setCD (cmp D)))
/// (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
/// which can be implemented as:
/// cmp C
/// ccmp D, inv(CD), CC
/// ccmp A, CA, inv(CD)
/// ccmp B, CB, inv(CA)
/// check for CB flags
///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement one of the inner (not) operations, not both!
/// @{
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
if (LHS.getValueType() == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
Opcode = AArch64ISD::FCCMP;
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// See emitComparison() for why we can only do this for SETEQ and SETNE.
Opcode = AArch64ISD::CCMN;
RHS = RHS.getOperand(1);
}
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
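// The node built below corresponds to an instruction such as
//   ccmp w0, w1, #nzcv, cond
// which performs the compare only if "cond" holds and otherwise sets the
// flags directly to the #nzcv immediate. Since that immediate is chosen to
// satisfy the inverse of OutCC, the condition being tested fails whenever
// the predicate does not hold.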
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate Set to true if we can negate the whole sub-tree just by
/// changing the conditions on the SETCC tests.
/// (this means we can call emitConjunctionRec() with
/// Negate==true on this sub-tree)
/// \param MustBeFirst Set to true if this subtree needs to be negated and we
/// cannot do the negation naturally. We are required to
/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
/// subexpression must be negated. This happens when the
/// outer expression is an OR. We can use this fact to know
/// that we have a double negation (or (or ...) ...) that
/// can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
bool &MustBeFirst, bool WillNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
if (Val->getOperand(0).getValueType() == MVT::f128)
return false;
CanNegate = true;
MustBeFirst = false;
return true;
}
// Protect against exponential runtime and stack overflow.
if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
bool IsOR = Opcode == ISD::OR;
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
bool MustBeFirstL;
if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
return false;
bool CanNegateR;
bool MustBeFirstR;
if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
return false;
if (MustBeFirstL && MustBeFirstR)
return false;
if (IsOR) {
// For an OR expression we need to be able to naturally negate at least
// one side or we cannot do the transformation at all.
if (!CanNegateL && !CanNegateR)
return false;
// If the result of the OR will be negated and we can naturally negate
// the leaves, then this sub-tree as a whole negates naturally.
CanNegate = WillNegate && CanNegateL && CanNegateR;
// If we cannot naturally negate the whole sub-tree, then this must be
// emitted first.
MustBeFirst = !CanNegate;
} else {
assert(Opcode == ISD::AND && "Must be OR or AND");
// We cannot naturally negate an AND operation.
CanNegate = false;
MustBeFirst = MustBeFirstL || MustBeFirstR;
}
return true;
}
return false;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
if (Negate)
CC = getSetCCInverse(CC, LHS.getValueType());
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
OutCC = changeIntCCToAArch64CC(CC);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
// Some floating point conditions can't be tested with a single condition
// code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
else
ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
ExtraCC, DL, DAG);
CCOp = ExtraCmp;
Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
}
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
bool IsOR = Opcode == ISD::OR;
SDValue LHS = Val->getOperand(0);
bool CanNegateL;
bool MustBeFirstL;
bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
assert(ValidL && "Valid conjunction/disjunction tree");
(void)ValidL;
SDValue RHS = Val->getOperand(1);
bool CanNegateR;
bool MustBeFirstR;
bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
assert(ValidR && "Valid conjunction/disjunction tree");
(void)ValidR;
// Swap sub-tree that must come first to the right side.
if (MustBeFirstL) {
assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
std::swap(LHS, RHS);
std::swap(CanNegateL, CanNegateR);
std::swap(MustBeFirstL, MustBeFirstR);
}
bool NegateR;
bool NegateAfterR;
bool NegateL;
bool NegateAfterAll;
if (Opcode == ISD::OR) {
// Swap the sub-tree that we can negate naturally to the left.
if (!CanNegateL) {
assert(CanNegateR && "at least one side must be negatable");
assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
assert(!Negate);
std::swap(LHS, RHS);
NegateR = false;
NegateAfterR = true;
} else {
// Negate the left sub-tree if possible, otherwise negate the result.
NegateR = CanNegateR;
NegateAfterR = !CanNegateR;
}
NegateL = true;
NegateAfterAll = !Negate;
} else {
assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
assert(!Negate && "Valid conjunction/disjunction tree");
NegateL = false;
NegateR = false;
NegateAfterR = false;
NegateAfterAll = false;
}
// Emit sub-trees.
AArch64CC::CondCode RHSCC;
SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
if (NegateAfterR)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
if (NegateAfterAll)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC) {
bool DummyCanNegate;
bool DummyMustBeFirst;
if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
return SDValue();
return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}
/// @}
/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
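/// For illustration, assuming each value below has a single use:
///   (sign_extend_inreg x)   -> 1
///   (shl (and x, 0xFF), 2)  -> 2  (supported extend plus shift <= 4)
///   (shl x:i32, 5)          -> 1  (plain in-range shift)
///   x                       -> 0  (nothing foldable)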
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
auto isSupportedExtend = [&](SDValue V) {
if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
return true;
if (V.getOpcode() == ISD::AND)
if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
uint64_t Mask = MaskCst->getZExtValue();
return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
}
return false;
};
if (!Op.hasOneUse())
return 0;
if (isSupportedExtend(Op))
return 1;
unsigned Opc = Op.getOpcode();
if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
uint64_t Shift = ShiftCst->getZExtValue();
if (isSupportedExtend(Op.getOperand(0)))
return (Shift <= 4) ? 2 : 1;
EVT VT = Op.getValueType();
if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
return 1;
}
return 0;
}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG,
const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
if (!isLegalArithImmed(C)) {
// Constant does not fit, try adjusting it by one?
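// For example, "x s< 0xFFF001" cannot be encoded directly (neither legal
// immediate form matches 0xFFF001), but the equivalent "x s<= 0xFFF000" can,
// so below we rewrite the condition code and the constant together.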
switch (CC) {
default:
break;
case ISD::SETLT:
case ISD::SETGE:
if ((VT == MVT::i32 && C != 0x80000000 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0x80000000ULL &&
isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULT:
case ISD::SETUGE:
if ((VT == MVT::i32 && C != 0 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETLE:
case ISD::SETGT:
if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULE:
case ISD::SETUGT:
if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
}
}
}
// Comparisons are canonicalized so that the RHS operand is simpler than the
// LHS one, the extreme case being when RHS is an immediate. However, AArch64
// can fold some shift+extend operations on the RHS operand, so swap the
// operands if that can be done.
//
// For example:
// lsl w13, w11, #1
// cmp w13, w12
// can be turned into:
// cmp w12, w11, lsl #1
if (!isa<ConstantSDNode>(RHS) ||
!isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
}
}
SDValue Cmp;
AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
// The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
// For the i8 operand, the largest immediate is 255, so this can be easily
// encoded in the compare instruction. For the i16 operand, however, the
// largest immediate cannot be encoded in the compare.
// Therefore, use a sign extending load and cmn to avoid materializing the
// -1 constant. For example,
// movz w1, #65535
// ldrh w0, [x0, #0]
// cmp w0, w1
// >
// ldrsh w0, [x0, #0]
// cmn w0, #1
// Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
// if and only if (sext LHS) == (sext RHS). The checks are in place to
// ensure both the LHS and RHS are truly zero extended and to make sure the
// transformation is profitable.
if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
LHS.getNode()->hasNUsesOfValue(1, 0)) {
int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
DAG.getValueType(MVT::i16));
Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
RHS.getValueType()),
CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
}
if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
}
}
}
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
"Unsupported value type");
SDValue Value, Overflow;
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned Opc = 0;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::VS;
break;
case ISD::UADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::HS;
break;
case ISD::SSUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::VS;
break;
case ISD::USUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::LO;
break;
// Multiply needs a little bit of extra work.
case ISD::SMULO:
case ISD::UMULO: {
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
// selector to generate a widening multiply (SMADDL/UMADDL). For that we
// need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
DAG.getConstant(0, DL, MVT::i64));
// On AArch64 the upper 32 bits are always zero extended for a 32 bit
// operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// noop.
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
if (IsSigned) {
// The signed overflow check requires more than just a simple check for
// any bit set in the upper 32 bits of the result. These bits could be
// just the sign bits of a negative number. To perform the overflow
// check we arithmetically shift the lower 32 bits of the result right by
// 31, replicating the sign bit, and compare that with the upper 32 bits.
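// For example, with %a = 0x40000000 and %b = 4 the widened signed product is
// 2^32 (0x100000000): the upper 32 bits are 1 while the lower 32 bits shifted
// right arithmetically by 31 give 0, so the compare below correctly reports
// overflow, since 2^32 does not fit in an i32.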
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
DAG.getConstant(32, DL, MVT::i64));
UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
DAG.getConstant(31, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
// The overflow check for unsigned multiply is easy. We only need to
// check if any of the upper 32 bits are set. This can be done with a
// CMP (shifted register). For that we need to generate the following
// pattern:
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
DAG.getConstant(32, DL, MVT::i64));
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
// For the 64 bit multiply
Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
if (IsSigned) {
SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
} // switch (...)
if (Opc) {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned Offset = IsStrict ? 1 : 0;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
MakeLibCallOptions CallOptions;
SDValue Result;
SDLoc dl(Op);
std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
CallOptions, dl, Chain);
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
SDValue Sel = Op.getOperand(0);
SDValue Other = Op.getOperand(1);
SDLoc dl(Sel);
// If the operand is an overflow checking operation, invert the condition
// code and kill the Not operation. I.e., transform:
// (xor overflow_op_bool, 1)
// -->
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
AArch64CC::CondCode CC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
// If neither operand is a SELECT_CC, give up.
if (Sel.getOpcode() != ISD::SELECT_CC)
std::swap(Sel, Other);
if (Sel.getOpcode() != ISD::SELECT_CC)
return Op;
// The folding we want to perform is:
// (xor x, (select_cc a, b, cc, 0, -1) )
// -->
// (csel x, (xor x, -1), cc ...)
//
// The latter will get matched to a CSINV instruction.
ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
SDValue LHS = Sel.getOperand(0);
SDValue RHS = Sel.getOperand(1);
SDValue TVal = Sel.getOperand(2);
SDValue FVal = Sel.getOperand(3);
// FIXME: This could be generalized to non-integer comparisons.
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return Op;
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
// The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
// We can commute the SELECT_CC by inverting the condition. This
// might be needed to make this fit into a CSINV pattern.
if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
// If the constants line up, perform the transform!
if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
FVal = Other;
TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
DAG.getConstant(-1ULL, dl, Other.getValueType()));
return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
CCVal, Cmp);
}
return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned Opc;
bool ExtraOp = false;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Invalid code");
case ISD::ADDC:
Opc = AArch64ISD::ADDS;
break;
case ISD::SUBC:
Opc = AArch64ISD::SUBS;
break;
case ISD::ADDE:
Opc = AArch64ISD::ADCS;
ExtraOp = true;
break;
case ISD::SUBE:
Opc = AArch64ISD::SBCS;
ExtraOp = true;
break;
}
if (!ExtraOp)
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
return SDValue();
SDLoc dl(Op);
AArch64CC::CondCode CC;
// The actual operation that sets the overflow or carry flag.
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
// We use 0 and 1 as false and true values.
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
// We use an inverted condition, because the conditional select is inverted
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
CCVal, Overflow);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
bool IsStream = !Locality;
// When the locality number is set
if (Locality) {
// The front-end should have filtered out the out-of-range values
assert(Locality <= 3 && "Prefetch locality out-of-range");
// The locality degree is the inverse of the cache level: higher locality
// means a closer (faster) cache. The encoding starts at 0 for level 1,
// so invert the number.
Locality = 3 - Locality;
}
// Build the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
(!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
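// For example, a read prefetch of data with maximal locality (IsWrite = 0,
// IsData = 1, original Locality = 3) ends up with IsStream = 0 and
// Locality = 0 after the inversion above, so PrfOp == 0b00000, which
// corresponds to PLDL1KEEP.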
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
RTLIB::Libcall LC;
LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
if (SrcVT != MVT::f128) {
// Expand cases where the input is a vector bigger than NEON.
if (useSVEForFixedLengthVectorVT(SrcVT))
return SDValue();
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
MakeLibCallOptions CallOptions;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue Result;
SDLoc dl(Op);
std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
CallOptions, dl, Chain);
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (InVT.getVectorElementType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
}
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
MVT ExtVT =
MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
// Type changing conversions are illegal.
return Op;
}
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
if (SrcVal.getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
}
if (SrcVal.getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
else
LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
unsigned CastOpc =
Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
return DAG.getNode(Op.getOpcode(), dl, VT, In);
}
return Op;
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
if (SrcVal.getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
// fp128.
if (Op.getValueType() != MVT::f128)
return Op;
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
else
LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SelectionDAG &DAG) const {
// For iOS, we want to call an alternative entry point: __sincos_stret,
// which returns the values in two S / D registers.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
: RTLIB::SINCOS_STRET_F32;
const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
EVT OpVT = Op.getValueType();
if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
SDLoc DL(Op);
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
if (OrigVT.getSizeInBits() >= 64)
return OrigVT;
assert(OrigVT.isSimple() && "Expecting a simple value type");
MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
switch (OrigSimpleTy) {
default: llvm_unreachable("Unexpected Vector Type");
case MVT::v2i8:
case MVT::v2i16:
return MVT::v2i32;
case MVT::v4i8:
return MVT::v4i16;
}
}
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
const EVT &OrigTy,
const EVT &ExtTy,
unsigned ExtOpcode) {
// The vector originally had a size of OrigTy. It was then extended to ExtTy.
// We expect the ExtTy to be 128-bits total. If the OrigTy is less than
// 64-bits we need to insert a new extension so that it will be 64-bits.
assert(ExtTy.is128BitVector() && "Unexpected extension size");
if (OrigTy.getSizeInBits() >= 64)
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
EVT NewVT = getExtensionTo64Bits(OrigTy);
return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
bool isSigned) {
EVT VT = N->getValueType(0);
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfSize = EltSize / 2;
if (isSigned) {
if (!isIntN(HalfSize, C->getSExtValue()))
return false;
} else {
if (!isUIntN(HalfSize, C->getZExtValue()))
return false;
}
continue;
}
return false;
}
return true;
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
N->getOpcode());
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
EVT VT = N->getValueType(0);
SDLoc dl(N);
unsigned EltSize = VT.getScalarSizeInBits() / 2;
unsigned NumElts = VT.getVectorNumElements();
MVT TruncVT = MVT::getIntegerVT(EltSize);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i != NumElts; ++i) {
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
const APInt &CInt = C->getAPIntValue();
// Element types smaller than 32 bits are not legal, so use i32 elements.
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::SIGN_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::ZERO_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, false);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
}
return false;
}
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
}
return false;
}
SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
// The rounding mode is in bits 23:22 of the FPCR.
// The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
// The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
// so that the shift + and get folded into a bitfield extract.
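// For example, FPCR bits 23:22 == 0b00 (round to nearest) gives
// ((0 + (1 << 22)) >> 22) & 3 == 1, and 0b01 (toward +infinity) gives
// ((0x400000 + 0x400000) >> 22) & 3 == 2, matching the mapping above.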
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDValue FPCR_64 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
{Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
Chain = FPCR_64.getValue(1);
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
DAG.getConstant(3, dl, MVT::i32));
return DAG.getMergeValues({AND, Chain}, dl);
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
bool isMLA = false;
bool isN0SExt = isSignExtended(N0, DAG);
bool isN1SExt = isSignExtended(N1, DAG);
if (isN0SExt && isN1SExt)
NewOpc = AArch64ISD::SMULL;
else {
bool isN0ZExt = isZeroExtended(N0, DAG);
bool isN1ZExt = isZeroExtended(N1, DAG);
if (isN0ZExt && isN1ZExt)
NewOpc = AArch64ISD::UMULL;
else if (isN1SExt || isN1ZExt) {
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
if (isN1SExt && isAddSubSExt(N0, DAG)) {
NewOpc = AArch64ISD::SMULL;
isMLA = true;
} else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
NewOpc = AArch64ISD::UMULL;
isMLA = true;
} else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
std::swap(N0, N1);
NewOpc = AArch64ISD::UMULL;
isMLA = true;
}
}
if (!NewOpc) {
if (VT == MVT::v2i64)
// Fall through to expand this. It is not legal.
return SDValue();
else
// Other vector multiplications are legal.
return Op;
}
}
// Legalize to a S/UMULL instruction
SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
if (!isMLA) {
Op0 = skipExtensionForVectorMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}
// Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
// isel lowering to take advantage of back-to-back s/umul + s/umla with no
// stall. This applies to CPUs with accumulate forwarding such as Cortex-A53/A57.
SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
int Pattern) {
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
DAG.getTargetConstant(Pattern, DL, MVT::i32));
}
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::aarch64_neon_abs: {
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
Op.getOperand(1));
Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
} else {
report_fatal_error("Unexpected type for AArch64 NEON intrinic");
}
}
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:
return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_sunpkhi:
return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_sunpklo:
return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_uunpkhi:
return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_uunpklo:
return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_clasta_n:
return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_clastb_n:
return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_lasta:
return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_lastb:
return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_rev:
return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_tbl:
return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_trn1:
return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_trn2:
return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_uzp1:
return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_uzp2:
return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_zip1:
return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_zip2:
return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_dupq_lane:
return LowerDUPQLane(Op, DAG);
case Intrinsic::aarch64_sve_convert_from_svbool:
return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_convert_to_svbool: {
EVT OutVT = Op.getValueType();
EVT InVT = Op.getOperand(1).getValueType();
// Return the operand if the cast isn't changing type,
// i.e. <n x 16 x i1> -> <n x 16 x i1>
if (InVT == OutVT)
return Op.getOperand(1);
// Otherwise, zero the newly introduced lanes.
SDValue Reinterpret =
DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
SDValue MaskReinterpret =
DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
}
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
EVT ScalarTy = Scalar.getValueType();
if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
Op.getOperand(1), Scalar);
}
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
unsigned Reg = RegInfo->getLocalAddressRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
Op.getSimpleValueType());
}
case Intrinsic::eh_recoverfp: {
// FIXME: This needs to be implemented to correctly handle highly aligned
// stack objects. For now we simply return the incoming FP. Refer to D53541
// for more details.
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
case Intrinsic::aarch64_neon_vsri:
case Intrinsic::aarch64_neon_vsli: {
EVT Ty = Op.getValueType();
if (!Ty.isVector())
report_fatal_error("Unexpected type for aarch64_neon_vsli");
assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3));
}
case Intrinsic::aarch64_neon_srhadd:
case Intrinsic::aarch64_neon_urhadd: {
bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
}
}
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
SelectionDAG &DAG) {
assert(VT.isVector() && "VT should be a vector type");
assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
SDValue Value = ST->getValue();
// It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
// extracts the word lane which represents the v4i8 subvector. It optimizes
// the store to:
//
// xtn v0.8b, v0.8h
// str s0, [x0]
SDValue Undef = DAG.getUNDEF(MVT::i16);
SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
{Undef, Undef, Undef, Undef});
SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
Value, UndefVec);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Trunc, DAG.getConstant(0, DL, MVT::i64));
return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
ST->getBasePtr(), ST->getMemOperand());
}
// Custom lowering for any store, vector or scalar, with or without a
// truncate operation. Currently we only custom lower truncating stores
// from vector v4i16 to v4i8 and volatile stores of i128.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc Dl(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
assert(StoreNode && "Can only custom lower store nodes");
SDValue Value = StoreNode->getValue();
EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
if (useSVEForFixedLengthVectorVT(VT))
return LowerFixedLengthVectorStoreToSVE(Op, DAG);
unsigned AS = StoreNode->getAddressSpace();
Align Alignment = StoreNode->getAlign();
if (Alignment < MemVT.getStoreSize() &&
!allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
}
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
// the custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
MemVT.getVectorElementCount().Min % 2u == 0 &&
((MemVT.getScalarSizeInBits() == 8u ||
MemVT.getScalarSizeInBits() == 16u ||
MemVT.getScalarSizeInBits() == 32u ||
MemVT.getScalarSizeInBits() == 64u))) {
SDValue Lo =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, Dl,
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
StoreNode->getValue(),
DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(1, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
return SDValue();
}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
LLVM_DEBUG(Op.dump());
switch (Op.getOpcode()) {
default:
llvm_unreachable("unimplemented operand");
return SDValue();
case ISD::BITCAST:
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
case ISD::SELECT:
return LowerSELECT(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
case ISD::JumpTable:
return LowerJumpTable(Op, DAG);
case ISD::BR_JT:
return LowerBR_JT(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(Op, DAG);
case ISD::BlockAddress:
return LowerBlockAddress(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
case ISD::VACOPY:
return LowerVACOPY(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE:
return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
case ISD::FMA:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
case ISD::FRAMEADDR:
return LowerFRAMEADDR(Op, DAG);
case ISD::SPONENTRY:
return LowerSPONENTRY(Op, DAG);
case ISD::RETURNADDR:
return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR:
return LowerADDROFRETURNADDR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SPLAT_VECTOR:
return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::INSERT_SUBVECTOR:
return LowerINSERT_SUBVECTOR(Op, DAG);
case ISD::SDIV:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
case ISD::UDIV:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
case ISD::SMIN:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
case ISD::UMIN:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
case ISD::SMAX:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
case ISD::UMAX:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
return LowerVectorSRA_SRL_SHL(Op, DAG);
case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
return LowerShiftRightParts(Op, DAG);
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
case ISD::OR:
return LowerVectorOR(Op, DAG);
case ISD::XOR:
return LowerXOR(Op, DAG);
case ISD::PREFETCH:
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
case ISD::FLT_ROUNDS_:
return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
return LowerVECREDUCE(Op, DAG);
case ISD::ATOMIC_LOAD_SUB:
return LowerATOMIC_LOAD_SUB(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VSCALE:
return LowerVSCALE(Op, DAG);
case ISD::TRUNCATE:
return LowerTRUNCATE(Op, DAG);
case ISD::LOAD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerFixedLengthVectorLoadToSVE(Op, DAG);
llvm_unreachable("Unexpected request to lower ISD::LOAD");
case ISD::ADD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
llvm_unreachable("Unexpected request to lower ISD::ADD");
}
}
bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
// Prefer NEON unless larger SVE registers are available.
return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
}
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
if (!useSVEForFixedLengthVectors())
return false;
if (!VT.isFixedLengthVector())
return false;
// Fixed length predicates should be promoted to i8.
// NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
if (VT.getVectorElementType() == MVT::i1)
return false;
// Don't use SVE for vectors we cannot scalarize if required.
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default:
return false;
case MVT::i8:
case MVT::i16:
case MVT::i32:
case MVT::i64:
case MVT::f16:
case MVT::f32:
case MVT::f64:
break;
}
// Ensure NEON MVTs only belong to a single register class.
if (VT.getSizeInBits() <= 128)
return false;
// Don't use SVE for types that don't fit.
if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
return false;
// TODO: Perhaps an artificial restriction, but worth having whilst getting
// the base fixed length SVE support in place.
if (!VT.isPow2VectorType())
return false;
return true;
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
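// Windows varargs and the Darwin PCS need special handling below; every
// other target for these conventions uses the standard AAPCS.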
if (Subtarget->isTargetWindows() && IsVarArg)
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
if (!IsVarArg)
return CC_AArch64_DarwinPCS;
return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
: CC_AArch64_DarwinPCS_VarArg;
case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
case CallingConv::CFGuard_Check:
return CC_AArch64_Win64_CFGuard_Check;
case CallingConv::AArch64_VectorCall:
case CallingConv::AArch64_SVE_VectorCall:
return CC_AArch64_AAPCS;
}
}
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
}
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
if (Ins[i].isOrigArg()) {
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types.
unsigned FrameIdx =
MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
}
SDValue ArgValue;
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else if (RegVT.isScalableVector() &&
RegVT.getVectorElementType() == MVT::i1)
RC = &AArch64::PPRRegClass;
else if (RegVT.isScalableVector())
RC = &AArch64::ZPRRegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
break;
case CCValAssign::AExtUpper:
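// The value was passed in the upper half of the 64-bit location, so shift
// it down before truncating to the expected value type.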
ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
DAG.getConstant(32, DL, RegVT));
ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
break;
}
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
? VA.getLocVT().getSizeInBits()
: VA.getValVT().getSizeInBits()) / 8;
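// On big-endian targets a small argument occupies the most significant
// bytes of its 8-byte stack slot, so bias the load offset accordingly.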
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
!Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
// For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
switch (VA.getLocInfo()) {
default:
break;
case CCValAssign::Trunc:
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
MemVT = VA.getLocVT();
break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
case CCValAssign::ZExt:
ExtType = ISD::ZEXTLOAD;
break;
case CCValAssign::AExt:
ExtType = ISD::EXTLOAD;
break;
}
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
}
if (VA.getLocInfo() == CCValAssign::Indirect) {
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
// If value is passed via pointer - do a load.
ArgValue =
DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
}
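// ILP32 pointers arrive zero-extended in a 64-bit register; record that the
// upper 32 bits are known to be zero.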
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
InVals.push_back(ArgValue);
}
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
// Win64 variadic functions also pass arguments in registers, but all float
// arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment, or 4 for ILP32
StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
SmallVector<MVT, 2> RegParmTypes;
RegParmTypes.push_back(MVT::i64);
RegParmTypes.push_back(MVT::f128);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
// Conservatively forward X8, since it might be used for aggregate return.
if (!CCInfo.isAllocated(AArch64::X8)) {
unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
}
}
}
// On Windows, InReg pointers must be returned, so record the pointer in a
// virtual register at the start of the function so it can be returned in the
// epilogue.
if (IsWin64) {
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
if (Ins[I].Flags.isInReg()) {
assert(!FuncInfo->getSRetReturnReg());
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Register Reg =
MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
break;
}
}
}
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
return Chain;
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SelectionDAG &DAG,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
SmallVector<SDValue, 8> MemOps;
static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
if (IsWin64) {
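// On Win64 the GPR save area sits directly below the incoming stack
// arguments so va_arg can walk all arguments contiguously, hence the fixed
// object at a negative offset.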
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
IsWin64
? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
GPRIdx,
(i - FirstVariadicGPR) * 8)
: MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
continue;
}
// Avoid copying a physreg twice since RegAllocFast is incompetent and only
// allows one use of a physreg per block.
SDValue Val = CopiedRegs.lookup(VA.getLocReg());
if (!Val) {
Val =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
CopiedRegs[VA.getLocReg()] = Val;
}
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
case CCValAssign::AExtUpper:
Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
DAG.getConstant(32, DL, VA.getLocVT()));
LLVM_FALLTHROUGH;
case CCValAssign::AExt:
LLVM_FALLTHROUGH;
case CCValAssign::ZExt:
Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
break;
}
InVals.push_back(Val);
}
return Chain;
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::PreserveMost:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
// When using the Windows calling convention on a non-Windows OS, we want
// to back up and restore X18 in such functions; we can't do a tail call
// from those functions.
if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
CalleeCC != CallingConv::Win64)
return false;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF.arg_begin(),
e = CallerF.arg_end();
i != e; ++i) {
if (i->hasByValAttr())
return false;
// On Windows, "inreg" attributes signify non-aggregate indirect returns.
// In this case, it is necessary to save/restore X0 in the callee. Tail
// call opt interferes with this. So we disable tail call opt when the
// caller has an argument with "inreg" attribute.
// FIXME: Check whether the callee also has an "inreg" argument.
if (i->hasInRegAttr())
return false;
}
if (getTargetMachine().Options.GuaranteedTailCallOpt)
return canGuaranteeTCO(CalleeCC) && CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
// to undefined weak functions to be replaced with a NOP or jump to the
// next instruction. The behaviour of branch instructions in this
// situation (as used for tail calls) is implementation-defined, so we
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
}
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
if (!ArgLoc.isRegLoc())
return false;
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
CCAssignFnForCall(CalleeCC, isVarArg),
CCAssignFnForCall(CallerCC, isVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (Subtarget->hasCustomCallingConv()) {
TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
}
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// If any of the arguments is passed indirectly, it must be SVE, so the
// 'getBytesInStackArgArea' is not sufficient to determine whether we need to
// allocate space on the stack. That is why we conservatively decide here that
// such a call cannot be a tail call.
if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
assert((A.getLocInfo() != CCValAssign::Indirect ||
A.getValVT().isScalableVector()) &&
"Expected value to be scalable");
return A.getLocInfo() == CCValAssign::Indirect;
}))
return false;
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made a tail call.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
return false;
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
return true;
}
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo &MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);
// Add a chain value for each load of an incoming stack argument whose slot
// overlaps the clobbered frame object.
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
}
// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
MachineFunction::CallSiteInfo CSInfo;
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
/*IsVarArg=*/ !Outs[i].IsFixed);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
} else {
// At this point, Outs[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeCallOperands to pass in ValVT and
// LocVT.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(),
CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallSet<unsigned, 8> RegsUsed;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
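// For a musttail call in a variadic function, re-emit copies of the
// registers that were forwarded on entry so they reach the callee unchanged.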
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
RegsToPass.emplace_back(F.PReg, Val);
}
}
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExtUpper:
assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
DAG.getConstant(32, DL, VA.getLocVT()));
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(VA.getLocVT(), Arg);
break;
case CCValAssign::Trunc:
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
int FI = MFI.CreateStackObject(
VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
MFI.setStackID(FI, TargetStackID::SVEVector);
SDValue SpillSlot = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Chain = DAG.getStore(
Chain, DL, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
break;
}
if (VA.isRegLoc()) {
if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
"unexpected use of 'returned'");
IsThisReturn = true;
}
if (RegsUsed.count(VA.getLocReg())) {
// If this register has already been used then we're trying to pack
// parts of an [N x i32] into an X-register. The extension type will
// take care of putting the two halves in the right place but we have to
// combine them.
SDValue &Bits =
std::find_if(RegsToPass.begin(), RegsToPass.end(),
[=](const std::pair<unsigned, SDValue> &Elt) {
return Elt.first == VA.getLocReg();
})
->second;
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
// Call site info is used for the function's parameter entry-value
// tracking. For now we track only the simple cases where a parameter
// is transferred through a whole register.
CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
[&VA](MachineFunction::ArgRegPair ArgReg) {
return ArgReg.Reg == VA.getLocReg();
}),
CSInfo.end());
} else {
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
}
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types.
uint32_t BEAlign = 0;
unsigned OpSize;
if (VA.getLocInfo() == CCValAssign::Indirect)
OpSize = VA.getLocVT().getSizeInBits();
else
OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode,
Outs[i].Flags.getNonZeroByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
/*isTailCall = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
auto GV = G->getGlobal();
unsigned OpFlags =
Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
if (OpFlags & AArch64II::MO_GOT) {
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
}
} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
if (CallConv == CallingConv::C) {
bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
return Out.VT.isScalableVector();
});
bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
return In.VT.isScalableVector();
});
if (CalleeInSVE || CalleeOutSVE)
CallConv = CallingConv::AArch64_SVE_VectorCall;
}
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
IsThisReturn = false;
Mask = TRI->getCallPreservedMask(MF, CallConv);
}
} else
Mask = TRI->getCallPreservedMask(MF, CallConv);
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
if (TRI->isAnyArgRegReserved(MF))
TRI->emitReservedArgRegCallError(MF);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
bool AArch64TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
auto &MF = DAG.getMachineFunction();
auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
SDValue Flag;
SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
SmallSet<unsigned, 4> RegsUsed;
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to i8 by the producer of the
// value. This is strictly redundant on Darwin (which uses "zeroext
// i1"), but will be optimised out before ISel.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
}
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
case CCValAssign::ZExt:
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
break;
case CCValAssign::AExtUpper:
assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
DAG.getConstant(32, DL, VA.getLocVT()));
break;
}
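// As on the argument side, two return values may share one X register
// (e.g. the halves of an [N x i32]); OR the pieces together rather than
// emitting a second copy to the same register.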
if (RegsUsed.count(VA.getLocReg())) {
SDValue &Bits =
std::find_if(RetVals.begin(), RetVals.end(),
[=](const std::pair<unsigned, SDValue> &Elt) {
return Elt.first == VA.getLocReg();
})
->second;
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
} else {
RetVals.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
}
}
SmallVector<SDValue, 4> RetOps(1, Chain);
for (auto &RetVal : RetVals) {
Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(
DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Windows AArch64 ABIs require that for returning structs by value we copy
// the sret argument into X0 for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into X0.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg = AArch64::X0;
Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AArch64::FPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}
// (loadGOT sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes instead of using a wrapper node.
return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}
// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, Ty,
getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}
// (addlow (adrp %hi(sym)) %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
SDValue Lo = getTargetNode(N, Ty, DAG,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
// (adr sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
}
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
if (OpFlags != AArch64II::MO_NO_FLAG)
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
// This also catches the large code model case for Darwin, and tiny code
// model with got relocations.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
return getGOT(GN, DAG, OpFlags);
}
SDValue Result;
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
Result = getAddrLarge(GN, DAG, OpFlags);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
Result = getAddrTiny(GN, DAG, OpFlags);
} else {
Result = getAddr(GN, DAG, OpFlags);
}
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(GN);
if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
/// + "extern __thread" declaration.
/// + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
/// adrp x0, _var@TLVPPAGE
/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
/// ; the function pointer
/// blr x1 ; Uses descriptor address in x0
/// ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"This function expects a Darwin target");
SDLoc DL(Op);
MVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
// The first entry in the descriptor is a function pointer that we must call
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
PtrMemVT, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
/* Alignment = */ PtrMemVT.getSizeInBits() / 8,
MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
// Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getTLSCallPreservedMask();
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
// Finally, we can make the call. This is just a degenerate version of a
// normal AArch64 call node: x0 takes the address of the descriptor, and
// returns the address of the variable in this thread.
Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
Chain =
DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
/// Convert a thread-local variable reference into a sequence of instructions to
/// compute the variable's address for the local exec TLS model of ELF targets.
/// The sequence depends on the maximum TLS area size.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
SDValue ThreadBase,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue TPOff, Addr;
switch (DAG.getTarget().Options.TLSSize) {
default:
llvm_unreachable("Unexpected TLS size");
case 12: {
// mrs x0, TPIDR_EL0
// add x0, x0, :tprel_lo12:a
SDValue Var = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
Var,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
}
case 24: {
// mrs x0, TPIDR_EL0
// add x0, x0, :tprel_hi12:a
// add x0, x0, :tprel_lo12_nc:a
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
}
case 32: {
// mrs x1, TPIDR_EL0
// movz x0, #:tprel_g1:a
// movk x0, #:tprel_g0_nc:a
// add x0, x1, x0
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
DAG.getTargetConstant(16, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
case 48: {
// mrs x1, TPIDR_EL0
// movz x0, #:tprel_g2:a
// movk x0, #:tprel_g1_nc:a
// movk x0, #:tprel_g0_nc:a
// add x0, x1, x0
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
SDValue MiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
DAG.getTargetConstant(32, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
DAG.getTargetConstant(16, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
}
}
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
/// adrp x0, :tlsdesc:var
/// ldr x1, [x0, #:tlsdesc_lo12:var]
/// add x0, x0, #:tlsdesc_lo12:var
/// .tlsdesccall var
/// blr x1
/// (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain =
DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
}
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Model != TLSModel::LocalExec)
report_fatal_error("ELF TLS only supported in small memory model or "
"in local exec TLS model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
// maximum TLS size is 4GiB.
// FIXME: add tiny and large code model support for TLS access models other
// than local exec. We currently generate the same code as small for tiny,
// which may be larger than needed.
SDValue TPOff;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
if (Model == TLSModel::LocalExec) {
return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
} else if (Model == TLSModel::InitialExec) {
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
} else if (Model == TLSModel::LocalDynamic) {
// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
// the beginning of the module's TLS region, followed by a DTPREL offset
// calculation.
// These accesses will need deduplicating if there's more than one.
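// Roughly, the address assembled below is (illustrative notation):
//   TPIDR_EL0 + tlsdesc(_TLS_MODULE_BASE_) + :dtprel_hi12:var + :dtprel_lo12_nc:var
// i.e. the thread pointer, plus the module's TLS base offset returned by the
// descriptor call, plus the variable's DTPREL offset within that module.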
AArch64FunctionInfo *MFI =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
AArch64II::MO_TLS);
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
} else if (Model == TLSModel::GeneralDynamic) {
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
SDValue Chain = DAG.getEntryNode();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
// Load the ThreadLocalStoragePointer from the TEB
// A pointer to the TLS array is located at offset 0x58 from the TEB.
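// Illustrative sketch of the address computed below, in C-like pseudocode:
//   TLSArray = *(TEB + 0x58);           // TEB is in x18
//   TLSBase  = TLSArray[_tls_index];    // i.e. *(TLSArray + _tls_index * 8)
//   Addr     = TLSBase + <offset of the variable within the .tls section>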
SDValue TLSArray =
DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
Chain = TLSArray.getValue(1);
// Load the TLS index from the C runtime;
// This does the same as getAddr(), but without having a GlobalAddressSDNode.
// This also does the same as LOADgot, but using a generic i32 load,
// while LOADgot only loads i64.
SDValue TLSIndexHi =
DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
"_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
SDValue TLSIndex =
DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
Chain = TLSIndex.getValue(1);
// The pointer to the thread's TLS data area is found at offset
// (TLS index * 8) into the TLSArray.
TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
DAG.getConstant(3, DL, PtrVT));
SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
MachinePointerInfo());
Chain = TLS.getValue(1);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue TGAHi = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue TGALo = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
// Add the offset from the start of the .tls section (section base).
SDValue Addr =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
return Addr;
}
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetWindows())
return LowerWindowsGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
}
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
bool ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
// Handle f128 first, since lowering it will result in comparing the return
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
// The actual operation with overflow check.
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// If the RHS of the comparison is zero, we can potentially fold this
// to a specialized branch.
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
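// For example, (brcond (seteq (and X, 8), 0), Dest) becomes
// (TBZ X, 3, Dest), since 8 == 1 << 3 and TBZ branches when that bit is zero.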
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
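// For example, with a 64-bit LHS, (brcond (setlt X, 0), Dest) becomes
// (TBNZ X, 63, Dest), i.e. branch if the sign bit is set.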
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
}
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
Cmp);
}
return BR1;
}
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
EVT VecVT;
uint64_t EltMask;
SDValue VecVal1, VecVal2;
auto setVecVal = [&] (int Idx) {
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In1);
VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In2);
} else {
VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
}
};
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
setVecVal(AArch64::ssub);
} else if (VT == MVT::f64 || VT == MVT::v2f64) {
VecVT = MVT::v2i64;
// We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
setVecVal(AArch64::dsub);
} else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
EltMask = 0x8000ULL;
setVecVal(AArch64::hsub);
} else {
llvm_unreachable("Invalid type for copysign!");
}
SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
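// This works because FNEG of +0.0 is -0.0, whose bit pattern is exactly
// 0x8000000000000000, i.e. the desired sign-bit mask for 64-bit elements.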
if (VT == MVT::f64 || VT == MVT::v2f64) {
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
}
SDValue Sel =
DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
if (VT == MVT::f16)
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
if (VT == MVT::f32)
return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
else if (VT == MVT::f64)
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
else
return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
if (!Subtarget->hasNEON())
return SDValue();
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
// the AdvSIMD registers are cheap.
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
// CNT V0.8B, V0.8B // 8xbyte pop-counts
// ADDV B0, V0.8B // sum 8xbyte pop-counts
// UMOV X0, V0.B[0] // copy byte result back to integer reg
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (VT == MVT::i32 || VT == MVT::i64) {
if (VT == MVT::i32)
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue UaddLV = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
} else if (VT == MVT::i128) {
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
SDValue UaddLV = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
"Unexpected type for custom ctpop lowering");
EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
Val = DAG.getBitcast(VT8Bit, Val);
Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
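// For example, for VT == v4i32 the loop below performs:
//   v16i8 byte counts --uaddlp--> v8i16 --uaddlp--> v4i32
// each step halving the element count and summing adjacent pairs.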
unsigned EltSize = 8;
unsigned NumElts = VT.is64BitVector() ? 8 : 16;
while (EltSize != VT.getScalarSizeInBits()) {
EltSize *= 2;
NumElts /= 2;
MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
Val = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
}
return Val;
}
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
bool IsStrict = Op->isStrictFPOpcode();
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Chain;
if (IsStrict)
Chain = Op.getOperand(0);
SDValue LHS = Op.getOperand(OpNo + 0);
SDValue RHS = Op.getOperand(OpNo + 1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
EVT VT = Op.getValueType();
SDValue TVal = DAG.getConstant(1, dl, VT);
SDValue FVal = DAG.getConstant(0, dl, VT);
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
IsSignaling);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
}
}
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(
LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
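// For example, (setcc X, Y, eq) is emitted as CSEL 0, 1, ne, (CMP X, Y),
// which selects to a single CSINC with both sources WZR (the "cset eq" alias).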
SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
SDValue Cmp;
if (IsStrict)
Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
else
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue Res;
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
// this case, we emit the first CSEL and then emit a second using the output
// of the first as the RHS. We're effectively OR'ing the two CC's together.
// FIXME: It would be nice if we could match the two CSELs to two CSINCs.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 =
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Also handle f16, for which we need to do an f32 comparison.
if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
}
// Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
} else if (CTVal && CFVal) {
const int64_t TrueVal = CTVal->getSExtValue();
const int64_t FalseVal = CFVal->getSExtValue();
bool Swap = false;
// If both TVal and FVal are constants, see if FVal is the
// inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
// instead of a CSEL in that case.
if (TrueVal == ~FalseVal) {
Opcode = AArch64ISD::CSINV;
} else if (TrueVal == -FalseVal) {
Opcode = AArch64ISD::CSNEG;
} else if (TVal.getValueType() == MVT::i32) {
// If our operands are only 32-bit wide, make sure we use 32-bit
// arithmetic for the check whether we can use CSINC. This ensures that
// the addition in the check will wrap around properly in case there is
// an overflow (which would not be the case if we do the check with
// 64-bit arithmetic).
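// For example, TVal == INT32_MAX (0x7fffffff) and FVal == INT32_MIN
// (0x80000000): in 32-bit arithmetic TrueVal32 + 1 wraps around to
// FalseVal32, so a CSINC can be used, whereas the sign-extended 64-bit
// values differ by far more than one.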
const uint32_t TrueVal32 = CTVal->getZExtValue();
const uint32_t FalseVal32 = CFVal->getZExtValue();
if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal32 > FalseVal32) {
Swap = true;
}
}
// 64-bit check whether we can use CSINC.
} else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal > FalseVal) {
Swap = true;
}
}
// Swap TVal and FVal if necessary.
if (Swap) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (Opcode != AArch64ISD::CSEL) {
// Drop FVal since we can get its value by simply inverting/negating
// TVal.
FVal = TVal;
}
}
// Avoid materializing a constant when possible by reusing a known value in
// a register. However, don't perform this optimization if the known value
// is one, zero or negative one in the case of a CSEL. We can always
// materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
// FVal, respectively.
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
TVal = LHS;
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
FVal = DAG.getConstant(0, dl, FVal.getValueType());
}
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
EVT VT = TVal.getValueType();
return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two CSELs to implement.
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (DAG.getTarget().Options.UnsafeFPMath) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
if (RHSVal && RHSVal->isZero()) {
ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
TVal = LHS;
else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
CFVal && CFVal->isZero() &&
FVal.getValueType() == LHS.getValueType())
FVal = LHS;
}
}
// Emit first, and possibly only, CSEL.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
// If we need a second CSEL, emit it, using the output of the first as the
// RHS. We're effectively OR'ing the two CC's together.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
// Otherwise, return the output of the first CSEL.
return CS1;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
SDLoc DL(Op);
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SelectionDAG &DAG) const {
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
EVT Ty = Op.getValueType();
if (Ty.isScalableVector()) {
SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
}
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
if (ISD::isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
// Lower it the same way as we would lower a SELECT_CC node.
ISD::CondCode CC;
SDValue LHS, RHS;
if (CCVal.getOpcode() == ISD::SETCC) {
LHS = CCVal.getOperand(0);
RHS = CCVal.getOperand(1);
CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
} else {
LHS = CCVal;
RHS = DAG.getConstant(0, DL, CCVal.getValueType());
CC = ISD::SETNE;
}
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries are PC-relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(JT, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(JT, DAG);
}
return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries are PC-relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
SDLoc DL(Op);
SDValue JT = Op.getOperand(1);
SDValue Entry = Op.getOperand(2);
int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
SDNode *Dest =
DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
SDValue(Dest, 0));
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
return getGOT(CP, DAG);
}
return getAddrLarge(CP, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(CP, DAG);
} else {
return getAddr(CP, DAG);
}
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(BA, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(BA, DAG);
}
return getAddr(BA, DAG);
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
? FuncInfo->getVarArgsGPRIndex()
: FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
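// As a rough sketch, that layout corresponds to:
//   struct va_list {
//     void *__stack;   // offset 0:  next stacked argument
//     void *__gr_top;  // offset 8:  end of the GP register save area
//     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
//     int   __gr_offs; // offset 24: negative offset from __gr_top
//     int   __vr_offs; // offset 28: negative offset from __vr_top
//   };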
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8),
/* Alignment = */ 8));
}
// void *__vr_top at offset 16
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(16, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16),
/* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
return LowerWin64_VASTART(Op, DAG);
else if (Subtarget->isTargetDarwin())
return LowerDarwin_VASTART(Op, DAG);
else
return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SelectionDAG &DAG) const {
// AAPCS has three pointers and two ints (= 32 bytes); Darwin and Windows have
// a single pointer.
SDLoc DL(Op);
unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
unsigned VaListSize = (Subtarget->isTargetDarwin() ||
Subtarget->isTargetWindows()) ? PtrSize : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(VaListSize, DL, MVT::i32),
Align(PtrSize), false, false, false,
MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"automatic va_arg instruction only works on Darwin");
const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
MaybeAlign Align(Op.getConstantOperandVal(3));
unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
SDValue VAList =
DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
if (Align && *Align > MinSlotSize) {
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align->value() - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
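// For example, a "float" vaarg occupies an 8-byte slot here; it is loaded
// below as an f64 and then rounded back down to f32.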
if (VT.isInteger() && !VT.isVector())
ArgSize = std::max(ArgSize, MinSlotSize);
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
NeedFPTrunc = true;
}
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
SDValue WideFP =
DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
if (Subtarget->isTargetILP32())
FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
DAG.getValueType(VT));
return FrameAddr;
}
SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
EVT VT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
int FI = MFI.CreateFixedObject(4, 0, false);
return DAG.getFrameIndex(FI, VT);
}
#define GET_REGISTER_MATCHER
#include "AArch64GenAsmMatcher.inc"
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register AArch64TargetLowering::
getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
if (!Subtarget->isXRegisterReserved(DwarfRegNum))
Reg = 0;
}
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
/// LowerShiftRightParts - Lower SRA_PARTS/SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
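/// As an illustrative scalar formula, for a value split as {Lo, Hi} and a
/// shift amount Amt, the nodes built below compute:
///   Amt < 64:   Lo' = (Lo >>u Amt) | (Hi << (64 - Amt)),  Hi' = Hi >> Amt
///   Amt >= 64:  Lo' = Hi >> (Amt - 64),  Hi' = (SRA ? Hi >> 63 : 0)
/// where Hi's ">>" is arithmetic for SRA_PARTS and logical for SRL_PARTS, and
/// the Amt == 0 case is special-cased with a CSEL because (Hi << 64) is
/// undefined.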
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
HiBitsForLo =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
HiBitsForLo, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
SDValue LoForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
SDValue HiForBigShift =
Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i64))
: DAG.getConstant(0, dl, VT);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
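/// As an illustrative scalar formula, for a value split as {Lo, Hi} and a
/// shift amount Amt, the nodes built below compute:
///   Amt < 64:   Hi' = (Hi << Amt) | (Lo >>u (64 - Amt)),  Lo' = Lo << Amt
///   Amt >= 64:  Hi' = Lo << (Amt - 64),  Lo' = 0
/// with the Amt == 0 case special-cased via CSEL because (Lo >>u 64) is
/// undefined.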
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
assert(Op.getOpcode() == ISD::SHL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
LoBitsForHi =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
LoBitsForHi, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
SDValue HiForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// Offsets are folded in the DAG combine rather than here so that we can
// intelligently choose an offset based on the uses.
return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool OptForSize) const {
bool IsLegal = false;
// We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
// 16-bit case when target has full fp16 support.
// FIXME: We should be able to handle f128 as well with a clever lowering.
const APInt ImmInt = Imm.bitcastToAPInt();
if (VT == MVT::f64)
IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f32)
IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f16 && Subtarget->hasFullFP16())
IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
// TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
// generate that fmov.
// If we cannot materialize the value in the immediate field of an fmov, check if the
// value can be encoded as the immediate operand of a logical instruction.
// The immediate value will be created with either MOVZ, MOVN, or ORR.
if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
// The cost is exactly the same for mov+fmov vs. adrp+ldr; however, the
// mov+fmov sequence is always better because of the reduced cache pressure.
// The timings are still the same if you consider movw+movk+fmov vs. adrp+ldr
// (it's one instruction longer, but the movw+movk is fused). So we limit the
// expansion to at most 2 instructions.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
Insn);
unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
IsLegal = Insn.size() <= Limit;
}
LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
<< " imm value: "; Imm.dump(););
return IsLegal;
}
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
SDValue Operand, SelectionDAG &DAG,
int &ExtraSteps) {
EVT VT = Operand.getValueType();
if (ST->hasNEON() &&
(VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
VT == MVT::f32 || VT == MVT::v1f32 ||
VT == MVT::v2f32 || VT == MVT::v4f32)) {
if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
// For the reciprocal estimates, convergence is quadratic, so the number
// of digits is doubled after each iteration. In ARMv8, the accuracy of
// the initial estimate is 2^-8. Thus the number of extra steps to refine
// the result for float (23 mantissa bits) is 2 and for double (52
// mantissa bits) is 3.
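// Roughly: an initial error of 2^-8 becomes ~2^-16 after one step and
// ~2^-32 after two (enough for float's 23 mantissa bits), and ~2^-64 after
// three (enough for double's 52 mantissa bits).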
ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst,
bool Reciprocal) const {
if (Enabled == ReciprocalEstimate::Enabled ||
(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setAllowReassociation(true);
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
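// As a sketch of where this comes from: Newton's method applied to
// f(y) = 1/y^2 - X gives y' = y * (3 - X * y^2) / 2. FRSQRTS(a, b) computes
// (3 - a * b) / 2, so below Step = FRSQRTS(X, E * E) and the refined
// estimate is E * Step.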
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const {
if (Enabled == ReciprocalEstimate::Enabled)
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setAllowReassociation(true);
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
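// As a sketch of where this comes from: Newton's method applied to
// f(y) = 1/y - X gives y' = y * (2 - X * y). FRECPS(a, b) computes 2 - a * b,
// so below Step = FRECPS(X, E) and the refined estimate is E * Step.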
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
Estimate, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
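// As an illustrative example of user code (the names are hypothetical):
//   long Res;
//   asm("add %0, %1, %2" : "=r"(Res) : "r"(Base), "I"(4095L));
// Here 'r' picks a general register (printed with its 64-bit x name for the
// "long" operands) and 'I' requires a constant that is a valid ADD immediate
// (0-4095, optionally shifted left by 12).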
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// At this point, we have to lower this constraint to something else, so we
// lower it to an "r" or "w". However, by doing this we will force the result
// to be in register, while the X constraint is much more permissive.
//
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
if (!Subtarget->hasFPARMv8())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
if (ConstraintVT.isVector() &&
(ConstraintVT.getSizeInBits() == 64 ||
ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
}
enum PredicateConstraint {
Upl,
Upa,
Invalid
};
static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
PredicateConstraint P = PredicateConstraint::Invalid;
if (Constraint == "Upa")
P = PredicateConstraint::Upa;
if (Constraint == "Upl")
P = PredicateConstraint::Upl;
return P;
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
case 'x':
case 'w':
case 'y':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'Y':
case 'Z':
return C_Immediate;
case 'z':
case 'S': // A symbolic address
return C_Other;
}
} else if (parsePredicateConstraint(Constraint) !=
PredicateConstraint::Invalid)
return C_RegisterClass;
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
break;
case 'x':
case 'w':
case 'y':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
break;
case 'z':
weight = CW_Constant;
break;
case 'U':
if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
weight = CW_Register;
break;
}
return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
case 'w':
if (!Subtarget->hasFPARMv8())
break;
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPRRegClass);
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
case 'x':
if (!Subtarget->hasFPARMv8())
break;
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
case 'y':
if (!Subtarget->hasFPARMv8())
break;
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
break;
}
} else {
PredicateConstraint PC = parsePredicateConstraint(Constraint);
if (PC != PredicateConstraint::Invalid) {
assert(VT.isScalableVector());
bool restricted = (PC == PredicateConstraint::Upl);
return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
: std::make_pair(0U, &AArch64::PPRRegClass);
}
}
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
if (VT != MVT::Other && VT.getSizeInBits() == 64) {
Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR64RegClass;
} else {
Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR128RegClass;
}
}
}
}
if (Res.second && !Subtarget->hasFPARMv8() &&
!AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
!AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
return std::make_pair(0U, nullptr);
return Res;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Currently only support length 1 constraints.
if (Constraint.length() != 1)
return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default:
break;
// This set of constraints deals with valid constants for various instructions.
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
if (!isNullConstant(Op))
return;
if (Op.getValueType() == MVT::i64)
Result = DAG.getRegister(AArch64::XZR, MVT::i64);
else
Result = DAG.getRegister(AArch64::WZR, MVT::i32);
break;
}
case 'S': {
// An absolute symbolic address or label reference.
if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
GA->getValueType(0));
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(Op)) {
Result =
DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
} else if (const ExternalSymbolSDNode *ES =
dyn_cast<ExternalSymbolSDNode>(Op)) {
Result =
DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
} else
return;
break;
}
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return;
// Grab the value and do some validation.
uint64_t CVal = C->getZExtValue();
switch (ConstraintLetter) {
// The I constraint applies only to simple ADD or SUB immediate operands:
// i.e. 0 to 4095 with optional shift by 12
// The J constraint applies only to ADD or SUB immediates that would be
// valid when negated, i.e. if [an add pattern] were to be output as a SUB
// instruction [or vice versa], in other words -1 to -4095 with optional
// left shift by 12.
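// Illustrative values (not part of the original source): 4095 and 0xFFF000
// (4095 << 12) both satisfy 'I', while 0x1001 does not; -1 and -4095 both
// satisfy 'J'.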
case 'I':
if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
break;
return;
case 'J': {
uint64_t NVal = -C->getSExtValue();
if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
CVal = C->getSExtValue();
break;
}
return;
}
// The K and L constraints apply *only* to logical immediates, including
// what used to be the MOVI alias for ORR (though the MOVI alias has now
// been removed and MOV should be used). So these constraints have to
// distinguish between bit patterns that are valid 32-bit or 64-bit
// "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
// not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
// versa.
case 'K':
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
return;
case 'L':
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
return;
// The M and N constraints are a superset of K and L respectively, for use
// with the MOV (immediate) alias. As well as the logical immediates they
// also match 32 or 64-bit immediates that can be loaded either using a
// *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
// (M) or 64-bit 0x1234000000000000 (N) etc.
// As a note, some of this code is liberally stolen from the asm parser.
case 'M': {
if (!isUInt<32>(CVal))
return;
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
if ((CVal & 0xFFFF) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
uint64_t NCVal = ~(uint32_t)CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
return;
}
case 'N': {
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
if ((CVal & 0xFFFFULL) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
if ((CVal & 0xFFFF00000000ULL) == CVal)
break;
if ((CVal & 0xFFFF000000000000ULL) == CVal)
break;
uint64_t NCVal = ~CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF00000000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
break;
return;
}
default:
return;
}
// All assembler immediates are 64-bit integers.
Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
break;
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
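// Illustrative note (not part of the original source): these immediate
// constraints surface in user inline asm, e.g.
//   asm("add %w0, %w1, %2" : "=r"(r) : "r"(a), "I"(4095));
// where the 'I' operand is validated by the switch above and emitted as a
// 64-bit target constant.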
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
EVT VT = V64Reg.getValueType();
unsigned NarrowSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
SDLoc DL(V64Reg);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
EVT EltType = V.getValueType().getVectorElementType();
return EltType.getSizeInBits() / 8;
}
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
EVT VT = V128Reg.getValueType();
unsigned WideSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
SDLoc DL(V128Reg);
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
struct ShuffleSourceInfo {
SDValue Vec;
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale" in ShuffleVec.
int WindowBase;
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
bool operator==(SDValue OtherVec) { return Vec == OtherVec; }
};
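// Note (not part of the original source): for example, if Vec has i32
// elements and the shuffle is built with i16 lanes, WindowScale is 2 and
// source element i covers shuffle lanes WindowBase + 2*i and
// WindowBase + 2*i + 1.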
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: "
"a shuffle can only come from building a vector from "
"various elements of other vectors, provided their "
"indices are constant\n");
return SDValue();
}
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
if (Sources.size() > 2) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: currently only do something sane when at "
"most two source vectors are involved\n");
return SDValue();
}
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy)) {
SmallestEltTy = SrcEltTy;
}
}
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
}
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
}
}
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined =
std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
}
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
unsigned NumElts = VT.getVectorNumElements();
// Assume that the first shuffle index is not UNDEF. Fail if it is.
if (M[0] < 0)
return false;
Imm = M[0];
// If this is a VEXT shuffle, the immediate value is the index of the first
// element. The other shuffle indices must be the successive elements after
// the first one.
unsigned ExpectedElt = Imm;
for (unsigned i = 1; i < NumElts; ++i) {
// Increment the expected index. If it wraps around, just follow it
// back to index zero and keep going.
++ExpectedElt;
if (ExpectedElt == NumElts)
ExpectedElt = 0;
if (M[i] < 0)
continue; // ignore UNDEF indices
if (ExpectedElt != static_cast<unsigned>(M[i]))
return false;
}
return true;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
unsigned &Imm) {
// Look for the first non-undef element.
const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
// Benefit from APInt to handle overflow when calculating the expected element.
unsigned NumElts = VT.getVectorNumElements();
unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
// The following shuffle indices must be the successive elements after the
// first real element.
const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
[&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
if (FirstWrongElt != M.end())
return false;
// The index of an EXT is the first element if it is not UNDEF.
// Watch out for the beginning UNDEFs. The EXT index should be the expected
// value of the first element. E.g.
// <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
// <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
// ExpectedElt is the last mask index plus 1.
Imm = ExpectedElt.getZExtValue();
// There are two different cases that require reversing the input vectors.
// For example, for vector <4 x i32> we have the following cases,
// Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
// Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
// For both cases, we finally use mask <5, 6, 7, 0>, which requires
// reversing the two input vectors.
if (Imm < NumElts)
ReverseEXT = true;
else
Imm -= NumElts;
return true;
}
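// Illustrative example (not part of the original source): for v8i8 the mask
// <3,4,5,6,7,8,9,10> leaves ExpectedElt at 11 after the scan, so Imm becomes
// 11 - NumElts == 3 with ReverseEXT left false, i.e. EXT V1, V2, #3.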
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
"Only possible block sizes for REV are: 16, 32, 64");
unsigned EltSz = VT.getScalarSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
unsigned BlockElts = M[0] + 1;
// If the first shuffle index is UNDEF, be optimistic.
if (M[0] < 0)
BlockElts = BlockSize / EltSz;
if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
return false;
for (unsigned i = 0; i < NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
return false;
}
return true;
}
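// Illustrative example (not part of the original source): for v8i8 with
// BlockSize == 32, BlockElts == 4, so <3,2,1,0,7,6,5,4> matches (every
// 32-bit block has its bytes reversed), i.e. a REV32 on .8b.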
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
return false;
Idx += 1;
}
return true;
}
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i != NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != 2 * i + WhichResult)
return false;
}
return true;
}
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
return false;
}
return true;
}
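// Illustrative examples (not part of the original source), for v4i32:
//   <0,4,1,5> is a ZIP mask (WhichResult == 0, i.e. ZIP1),
//   <0,2,4,6> is a UZP mask (WhichResult == 0, i.e. UZP1),
//   <0,4,2,6> is a TRN mask (WhichResult == 0, i.e. TRN1).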
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
return false;
Idx += 1;
}
return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned Half = VT.getVectorNumElements() / 2;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned j = 0; j != 2; ++j) {
unsigned Idx = WhichResult;
for (unsigned i = 0; i != Half; ++i) {
int MIdx = M[i + j * Half];
if (MIdx >= 0 && (unsigned)MIdx != Idx)
return false;
Idx += 2;
}
}
return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
return false;
}
return true;
}
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
bool &DstIsLeft, int &Anomaly) {
if (M.size() != static_cast<size_t>(NumInputElements))
return false;
int NumLHSMatch = 0, NumRHSMatch = 0;
int LastLHSMismatch = -1, LastRHSMismatch = -1;
for (int i = 0; i < NumInputElements; ++i) {
if (M[i] == -1) {
++NumLHSMatch;
++NumRHSMatch;
continue;
}
if (M[i] == i)
++NumLHSMatch;
else
LastLHSMismatch = i;
if (M[i] == i + NumInputElements)
++NumRHSMatch;
else
LastRHSMismatch = i;
}
if (NumLHSMatch == NumInputElements - 1) {
DstIsLeft = true;
Anomaly = LastLHSMismatch;
return true;
} else if (NumRHSMatch == NumInputElements - 1) {
DstIsLeft = false;
Anomaly = LastRHSMismatch;
return true;
}
return false;
}
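// Illustrative example (not part of the original source): with 4 input
// elements, the mask <0,1,6,3> matches every LHS lane except lane 2, so
// DstIsLeft is true and Anomaly == 2; the caller then inserts V2's lane 2
// into V1's lane 2.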
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
if (VT.getSizeInBits() != 128)
return false;
unsigned NumElts = VT.getVectorNumElements();
for (int I = 0, E = NumElts / 2; I != E; I++) {
if (Mask[I] != I)
return false;
}
int Offset = NumElts / 2;
for (int I = NumElts / 2, E = NumElts; I != E; I++) {
if (Mask[I] != I + SplitLHS * Offset)
return false;
}
return true;
}
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue V0 = Op.getOperand(0);
SDValue V1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
VT.getVectorElementType() != V1.getValueType().getVectorElementType())
return SDValue();
bool SplitV0 = V0.getValueSizeInBits() == 128;
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
if (SplitV0) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
}
if (V1.getValueSizeInBits() == 128) {
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
DAG.getConstant(0, DL, MVT::i64));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
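// Note (not part of the original source): each PerfectShuffleTable entry
// packs a cost in bits [31:30] (checked by the caller), one of the opcodes
// below in bits [29:26], and the two operand shuffle IDs in bits [25:13]
// and [12:0].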
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VREV,
OP_VDUP0,
OP_VDUP1,
OP_VDUP2,
OP_VDUP3,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
};
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
return RHS;
}
SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
switch (OpNum) {
default:
llvm_unreachable("Unknown shuffle opcode!");
case OP_VREV:
// VREV divides the vector in half and swaps within the half.
if (VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::f16 ||
VT.getVectorElementType() == MVT::bf16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
case OP_VDUP0:
case OP_VDUP1:
case OP_VDUP2:
case OP_VDUP3: {
EVT EltTy = VT.getVectorElementType();
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
else if (EltTy == MVT::i64 || EltTy == MVT::f64)
Opcode = AArch64ISD::DUPLANE64;
else
llvm_unreachable("Invalid vector element type?");
if (VT.getSizeInBits() == 64)
OpLHS = WidenVector(OpLHS, DAG);
SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
}
case OP_VEXT1:
case OP_VEXT2:
case OP_VEXT3: {
unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
DAG.getConstant(Imm, dl, MVT::i32));
}
case OP_VUZPL:
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VUZPR:
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPL:
return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPR:
return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNL:
return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNR:
return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
}
}
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SelectionDAG &DAG) {
// Check to see if we can use the TBL instruction.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc DL(Op);
EVT EltVT = Op.getValueType().getVectorElementType();
unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
SmallVector<SDValue, 8> TBLMask;
for (int Val : ShuffleMask) {
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
}
}
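// Note (not part of the original source): each shuffle index expands to
// BytesPerElt consecutive byte indices, e.g. a v4i16 mask <1,0,3,2> becomes
// the byte-level TBL mask <2,3,0,1,6,7,4,5>.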
MVT IndexVT = MVT::v8i8;
unsigned IndexLen = 8;
if (Op.getValueSizeInBits() == 128) {
IndexVT = MVT::v16i8;
IndexLen = 16;
}
SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
// DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
// IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
V2Cst, DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
if (EltType == MVT::i64 || EltType == MVT::f64)
return AArch64ISD::DUPLANE64;
llvm_unreachable("Invalid vector element type?");
}
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is an undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
Lane = 0;
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
V1.getOperand(0));
// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
// constant. If so, we can just reference the lane's definition directly.
if (V1.getOpcode() == ISD::BUILD_VECTOR &&
!isa<ConstantSDNode>(V1.getOperand(Lane)))
return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
// Try to eliminate a bitcasted extract subvector before a DUPLANE.
auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
// Match: dup (bitcast (extract_subv X, C)), LaneC
if (BitCast.getOpcode() != ISD::BITCAST ||
BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// The extract index must align in the destination type. That may not
// happen if the bitcast is from narrow to wide type.
SDValue Extract = BitCast.getOperand(0);
unsigned ExtIdx = Extract.getConstantOperandVal(1);
unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
if (ExtIdxInBits % CastedEltBitWidth != 0)
return false;
// Update the lane value by offsetting with the scaled extract index.
LaneC += ExtIdxInBits / CastedEltBitWidth;
// Determine the casted vector type of the wide vector input.
// dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
// Examples:
// dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
// dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
unsigned SrcVecNumElts =
Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
SrcVecNumElts);
return true;
};
MVT CastVT;
if (getScaledOffsetDup(V1, Lane, CastVT)) {
V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
} else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// The lane is incremented by the index of the extract.
// Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
Lane += V1.getConstantOperandVal(1);
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
// The lane is decremented if we are splatting from the 2nd operand.
// Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
} else if (VT.getSizeInBits() == 64) {
// Widen the operand to 128-bit register with undef.
V1 = WidenVector(V1, DAG);
}
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)
std::swap(V1, V2);
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
unsigned WhichResult;
if (isZIPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isUZPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isTRNMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
int Anomaly;
int NumInputElements = V1.getValueType().getVectorNumElements();
if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
SDValue DstVec = DstIsLeft ? V1 : V2;
SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
SDValue SrcVec = V1;
int SrcLane = ShuffleMask[Anomaly];
if (SrcLane >= NumInputElements) {
SrcVec = V2;
SrcLane -= VT.getVectorNumElements();
}
SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
DstLaneV);
}
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = ShuffleMask[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
return GenerateTBL(Op, ShuffleMask, DAG);
}
SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT ElemVT = VT.getScalarType();
SDValue SplatVal = Op.getOperand(0);
// Extend the input splat value where needed to fit into a GPR (32b or 64b
// only); FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
case MVT::i1: {
// The only legal i1 vectors are SVE vectors, so we can use SVE-specific
// lowering code.
if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
if (ConstVal->isOne())
return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
// TODO: Add special case for constant false
}
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
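// Note (not part of the original source): the splat value is sign-extended
// from i1 below, so it becomes either 0 or all-ones as an i64; whilelo(0, 0)
// gives an all-false predicate and whilelo(0, ~0) gives an all-true one,
// which is exactly the splat of the original i1.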
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
DAG.getValueType(MVT::i1));
SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
MVT::i64);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
DAG.getConstant(0, dl, MVT::i64), SplatVal);
}
case MVT::i8:
case MVT::i16:
case MVT::i32:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
break;
case MVT::i64:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
break;
case MVT::f16:
case MVT::bf16:
case MVT::f32:
case MVT::f64:
// Fine as is
break;
default:
report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
}
return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
}
SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (!isTypeLegal(VT) || !VT.isScalableVector())
return SDValue();
// Current lowering only supports the SVE-ACLE types.
if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
return SDValue();
// The DUPQ operation is independent of element type, so normalise to i64s.
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
SDValue Idx128 = Op.getOperand(2);
// DUPQ can be used when idx is in range.
auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
if (CIdx && (CIdx->getZExtValue() <= 3)) {
SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
SDNode *DUPQ =
DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
}
// The ACLE says this must produce the same result as:
// svtbl(data, svadd_x(svptrue_b64(),
// svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
// index * 2))
SDValue One = DAG.getConstant(1, DL, MVT::i64);
SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
// create the vector 0,1,0,1,...
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
DL, MVT::nxv2i64, Zero, One);
SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
// create the vector idx64,idx64+1,idx64,idx64+1,...
SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
// create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
for (unsigned i = 0; i < NumSplats; ++i) {
CnstBits <<= SplatBitSize;
UndefBits <<= SplatBitSize;
CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
}
return true;
}
return false;
}
// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 32-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
Shift = 0;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
Shift = 8;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
Shift = 16;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
Shift = 24;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 16-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
Shift = 0;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
Shift = 8;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 32-bit splatted SIMD immediate with shifted ones.
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
SelectionDAG &DAG, const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
Shift = 264;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
Shift = 272;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 8-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try FP splatted SIMD immediate.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
bool isWide = (VT.getSizeInBits() == 128);
MVT MovTy;
bool isAdvSIMDModImm = false;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
}
else if (isWide &&
(isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
MovTy = MVT::v2f64;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Specialized code to quickly find if PotentialBVec is a BuildVector whose
// elements are all the same constant int value, returned in the reference
// arg ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
uint64_t &ConstVal) {
BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
if (!Bvec)
return false;
ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
if (!FirstElt)
return false;
EVT VT = Bvec->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 1; i < NumElts; ++i)
if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
return false;
ConstVal = FirstElt->getZExtValue();
return true;
}
static unsigned getIntrinsicID(const SDNode *N) {
unsigned Opcode = N->getOpcode();
switch (Opcode) {
default:
return Intrinsic::not_intrinsic;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
if (IID < Intrinsic::num_intrinsics)
return IID;
return Intrinsic::not_intrinsic;
}
}
}
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
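// Illustrative example (not part of the original source): for v8i8 with
// C2 == 3 the SLI form requires C1 == 0x07 (low three bits set), so
//   (or (and X, splat(0x07)), (shl Y, 3)) --> SLI X, Y, #3
// which shifts each lane of Y left by 3 and inserts it over X while keeping
// X's low 3 bits.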
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
SDLoc DL(N);
SDValue And;
SDValue Shift;
SDValue FirstOp = N->getOperand(0);
unsigned FirstOpc = FirstOp.getOpcode();
SDValue SecondOp = N->getOperand(1);
unsigned SecondOpc = SecondOp.getOpcode();
// Is one of the operands an AND or a BICi? The AND may have been optimised to
// a BICi in order to use an immediate instead of a register.
// Is the other operand an shl or lshr? This will have been turned into:
// AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
(SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
And = FirstOp;
Shift = SecondOp;
} else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
(FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
And = SecondOp;
Shift = FirstOp;
} else
return SDValue();
bool IsAnd = And.getOpcode() == ISD::AND;
bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
uint64_t C1;
if (IsAnd) {
// Is the and mask vector all constant?
if (!isAllConstantBuildVector(And.getOperand(1), C1))
return SDValue();
} else {
// Reconstruct the corresponding AND immediate from the two BICi immediates.
ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
assert(C1nodeImm && C1nodeShift);
C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
}
// Is C1 == ~(Ones(ElemSizeInBits) << C2) or
// C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
// how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
APInt C1AsAPInt(ElemSizeInBits, C1);
APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
: APInt::getLowBitsSet(ElemSizeInBits, C2);
if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
LLVM_DEBUG(dbgs() << "into: \n");
LLVM_DEBUG(ResultSLI->dump(&DAG));
++NumShiftInserts;
return ResultSLI;
}
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
if (!BVN) {
// OR commutes, so try swapping the operands.
LHS = Op.getOperand(1);
BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
}
if (!BVN)
return Op;
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)))
return NewOp;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)))
return NewOp;
}
// We can always fall back to a non-immediate OR.
return Op;
}
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
static SDValue NormalizeBuildVector(SDValue Op,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT EltTy = VT.getVectorElementType();
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
SmallVector<SDValue, 16> Ops;
for (SDValue Lane : Op->ops()) {
// For integer vectors, type legalization would have promoted the
// operands already. Otherwise, if Op is a floating-point splat
// (with operands cast to integers), then the only possibilities
// are constants and UNDEFs.
if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
APInt LowBits(EltTy.getSizeInBits(),
CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
} else if (Lane.getNode()->isUndef()) {
Lane = DAG.getUNDEF(MVT::i32);
} else {
assert(Lane.getValueType() == MVT::i32 &&
"Unexpected BUILD_VECTOR operand type");
}
Ops.push_back(Lane);
}
return DAG.getBuildVector(VT, dl, Ops);
}
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
DefBits = UndefBits;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
DefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
}
return SDValue();
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
if (VT.isInteger()) {
// Certain vector constants, used to express things like logical NOT and
// arithmetic NEG, are passed through unmodified. This allows special
// patterns for these operations to match, which will lower these constants
// to whatever is proven necessary.
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (BVN->isConstant())
if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
APInt Val(BitSize,
Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
if (Val.isNullValue() || Val.isAllOnesValue())
return Op;
}
}
if (SDValue V = ConstantBuildVector(Op, DAG))
return V;
// Scan through the operands to find some interesting properties we can
// exploit:
// 1) If only one value is used, we can use a DUP, or
// 2) if only the low element is not undef, we can just insert that, or
// 3) if only one constant value is used (w/ some non-constant lanes),
// we can splat the constant value into the whole vector then fill
// in the non-constant lanes.
// 4) FIXME: If different constant values are used, but we can intelligently
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
// 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
AllLanesExtractElt = false;
if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
++NumConstantLanes;
if (!ConstantValue.getNode())
ConstantValue = V;
else if (ConstantValue != V)
usesOnlyOneConstantValue = false;
}
if (!Value.getNode())
Value = V;
else if (V != Value)
usesOnlyOneValue = false;
}
if (!Value.getNode()) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
return DAG.getUNDEF(VT);
}
// Convert BUILD_VECTOR where all elements but the lowest are undef into
// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
"SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
}
if (AllLanesExtractElt) {
SDNode *Vector = nullptr;
bool Even = false;
bool Odd = false;
// Check whether the extract elements match the Even pattern <0,2,4,...> or
// the Odd pattern <1,3,5,...>.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
const SDNode *N = V.getNode();
if (!isa<ConstantSDNode>(N->getOperand(1)))
break;
SDValue N0 = N->getOperand(0);
// All elements are extracted from the same vector.
if (!Vector) {
Vector = N0.getNode();
// Check that the type of EXTRACT_VECTOR_ELT matches the type of
// BUILD_VECTOR.
if (VT.getVectorElementType() !=
N0.getValueType().getVectorElementType())
break;
} else if (Vector != N0.getNode()) {
Odd = false;
Even = false;
break;
}
// Extracted values are either at Even indices <0,2,4,...> or at Odd
// indices <1,3,5,...>.
uint64_t Val = N->getConstantOperandVal(1);
if (Val == 2 * i) {
Even = true;
continue;
}
if (Val - 1 == 2 * i) {
Odd = true;
continue;
}
// Something does not match: abort.
Odd = false;
Even = false;
break;
}
if (Even || Odd) {
SDValue LHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(0, dl, MVT::i64));
SDValue RHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(NumElts, dl, MVT::i64));
if (Even && !Odd)
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
RHS);
if (Odd && !Even)
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
RHS);
}
}
// Use DUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
}
// This is actually a DUPLANExx operation, which keeps everything vectory.
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
"widening it\n");
Value = WidenVector(Value, DAG);
}
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
return DAG.getNode(Opcode, dl, VT, Value, Lane);
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
EltTy == MVT::f64) && "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
Val.dump(););
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
}
// If there was only one constant value used, and it was used for more than one lane,
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
// Firstly, try to materialize the splat constant.
SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
Val = ConstantBuildVector(Vec, DAG);
if (!Val) {
// Otherwise, materialize the constant and splat it.
Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
}
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
}
return Val;
}
// This will generate a load from the constant pool.
if (isConstant) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
"expansion\n");
return SDValue();
}
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
}
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
"of INSERT_VECTOR_ELT\n");
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
unsigned i = 0;
// Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register, and we're forced to emit an
// INSERT_SUBREG that we can't fold anywhere.
//
// We also allow types like i8 and i16 which are illegal scalar but legal
// vector element types. After type-legalization the inserted value is
// extended (i32) and it is safe to cast them to the vector type by ignoring
// the upper bits of the lowest lane (e.g. v8i8, v4i16).
if (!Op0.isUndef()) {
LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
LLVM_DEBUG(if (i < NumElts) dbgs()
<< "Creating nodes for the other vector elements:\n";);
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
}
return Vec;
}
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
"better alternative\n");
return SDValue();
}
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
// to a V128 type and perform the insertion on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
Op.getOperand(1), Op.getOperand(2));
// Re-narrow the resultant vector.
return NarrowVector(Node, DAG);
}
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
// to a V128 type and perform the extraction on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
EVT ExtrTy = WideTy.getVectorElementType();
if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
ExtrTy = MVT::i32;
// For extractions, we just return the result directly.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType().isFixedLengthVector() &&
"Only cases that extract a fixed length vector are supported!");
EVT InVT = Op.getOperand(0).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
if (InVT.isScalableVector()) {
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG))
return Op;
return SDValue();
}
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
if (Idx == 0 && InVT.getSizeInBits() <= 128)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
return Op;
return SDValue();
}
SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType().isScalableVector() &&
"Only expect to lower inserts into scalable vectors!");
EVT InVT = Op.getOperand(1).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
// We don't have any patterns for scalable vector yet.
if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
return SDValue();
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
// Currently no fixed length shuffles that require SVE are legal.
if (useSVEForFixedLengthVectorVT(VT))
return false;
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (M[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = M[i];
}
// Compute the index in the perfect shuffle table.
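// Each lane index (with 8 standing in for an undef lane) acts as a base-9
// digit, so, for example, the mask <0,1,4,5> selects entry
// 0*729 + 1*81 + 4*9 + 5 == 122 of PerfectShuffleTable.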
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return true;
}
bool DummyBool;
int DummyInt;
unsigned DummyUnsigned;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
// isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
isZIPMask(M, VT, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
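/// For example, a v4i32 build_vector of <7,7,7,7> (possibly behind bitcasts)
/// yields Cnt = 7; a vector with differing element values is rejected.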
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
// Ignore bit_converts.
while (Op.getOpcode() == ISD::BITCAST)
Op = Op.getOperand(0);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
HasAnyUndefs, ElementBits) ||
SplatBitSize > ElementBits)
return false;
Cnt = SplatBits.getSExtValue();
return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
/// 0 <= Value < ElementBits for a left shift; or
/// 0 <= Value <= ElementBits for a long left shift.
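/// For example, with v8i8 operands (ElementBits == 8) a splat of 7 is a legal
/// shift-left immediate, while 8 is accepted only when forming the long
/// (e.g. shll/ushll) variant.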
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits / 2 for a narrowing right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
// Attempt to form urhadd(OpA, OpB) from
// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
// The original form of this expression is
// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
// is called the srl will have been lowered to AArch64ISD::VLSHR and the
// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
// This function can also recognize a variant of this pattern that uses sign
// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
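// Roughly, for v8i8 operands this turns
//   trunc(vlshr(sub(zext(OpB), xor(zext(OpA), 0xff)), 1))
// into a single urhadd v0.8b, v1.8b (or srhadd for the sign-extended variant).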
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ if (VT.getScalarType() == MVT::i1) {
+ // Lower i1 truncate to `(x & 1) != 0`.
+ SDLoc dl(Op);
+ EVT OpVT = Op.getOperand(0).getValueType();
+ SDValue Zero = DAG.getConstant(0, dl, OpVT);
+ SDValue One = DAG.getConstant(1, dl, OpVT);
+ SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
+ return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
+ }
+
if (!VT.isVector() || VT.isScalableVector())
return Op;
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
// Since we are looking for a right shift by a constant value of 1 and we are
// operating on types at least 16 bits in length (sign/zero extended OpA and
// OpB, which are at least 8 bits), it follows that the truncate will always
// discard the shifted-in bit and therefore the right shift will be logical
// regardless of the signedness of OpA and OpB.
SDValue Shift = Op.getOperand(0);
if (Shift.getOpcode() != AArch64ISD::VLSHR)
return Op;
// Is the right shift using an immediate value of 1?
uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
if (ShiftAmount != 1)
return Op;
SDValue Sub = Shift->getOperand(0);
if (Sub.getOpcode() != ISD::SUB)
return Op;
SDValue Xor = Sub.getOperand(1);
if (Xor.getOpcode() != ISD::XOR)
return Op;
SDValue ExtendOpA = Xor.getOperand(0);
SDValue ExtendOpB = Sub.getOperand(0);
unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
if (!(ExtendOpAOpc == ExtendOpBOpc &&
(ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
return Op;
// Is the result of the right shift being truncated to the same value type as
// the original operands, OpA and OpB?
SDValue OpA = ExtendOpA.getOperand(0);
SDValue OpB = ExtendOpB.getOperand(0);
EVT OpAVT = OpA.getValueType();
assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
return Op;
// Is the XOR using a constant amount of all ones in the right hand side?
uint64_t C;
if (!isAllConstantBuildVector(Xor.getOperand(1), C))
return Op;
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
APInt CAsAPInt(ElemSizeInBits, C);
if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
return Op;
SDLoc DL(Op);
bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
return ResultURHADD;
}
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
int64_t Cnt;
if (!Op.getOperand(1).getValueType().isVector())
return Op;
unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
default:
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (VT.isScalableVector())
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
MVT::i32),
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
if (VT.isScalableVector()) {
unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
: AArch64ISD::SRL_MERGE_OP1;
return LowerToPredicatedOp(Op, DAG, Opc);
}
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
}
// Right shift register. Note, there is not a shift right register
// instruction, but the shift left register instruction takes a signed
// value, where negative numbers specify a right shift.
unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
: Intrinsic::aarch64_neon_ushl;
// negate the shift amount
SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
SDValue NegShiftLeft =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
NegShift);
return NegShiftLeft;
}
return SDValue();
}
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
bool IsZero = IsCnst && (CnstBits == 0);
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Fcmeq;
if (IsZero)
Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
else
Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (!NoNans)
return SDValue();
// If we ignore NaNs then we can use the MI implementation.
LLVM_FALLTHROUGH;
case AArch64CC::MI:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
}
}
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Cmeq;
if (IsZero)
Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
else
Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
case AArch64CC::LE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
case AArch64CC::LS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
case AArch64CC::LO:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
case AArch64CC::HI:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
case AArch64CC::HS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
}
}
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isScalableVector()) {
if (Op.getOperand(0).getValueType().isFloatingPoint())
return Op;
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
}
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
SDValue Cmp =
EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
// Make v4f16 (only) fcmp operations utilise vector instructions;
// v8f16 support will be a little more complicated.
if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
if (LHS.getValueType().getVectorNumElements() == 4) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
DAG.ReplaceAllUsesWith(Op, NewSetcc);
CmpVT = MVT::v4i32;
} else
return SDValue();
}
assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
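// For example, an ordered "not equal" vector compare needs both a
// greater-than and a less-than check; the two results are ORed together
// below when CC2 is set.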
AArch64CC::CondCode CC1, CC2;
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
}
Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
if (ShouldInvert)
Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return Cmp;
}
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
SelectionDAG &DAG) {
SDValue VecOp = ScalarOp.getOperand(0);
auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
DAG.getConstant(0, DL, MVT::i64));
}
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
switch (Op.getOpcode()) {
case ISD::VECREDUCE_ADD:
return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
case ISD::VECREDUCE_SMAX:
return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
case ISD::VECREDUCE_SMIN:
return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
case ISD::VECREDUCE_UMAX:
return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
case ISD::VECREDUCE_UMIN:
return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
case ISD::VECREDUCE_FMAX: {
assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
Op.getOperand(0));
}
case ISD::VECREDUCE_FMIN: {
assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
Op.getOperand(0));
}
default:
llvm_unreachable("Unhandled reduction");
}
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-add instruction, but not a load-sub.
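// So rewrite the subtraction as an addition of the negated operand, e.g.
// "atomicrmw sub %p, 4" becomes an LDADD of -4.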
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-clear instruction, but not a load-and.
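// LDCLR computes old & ~operand, so an AND with mask M is rewritten below as
// a clear of ~M, e.g. "atomicrmw and %p, 0x0f" on an i8 becomes an LDCLR of
// 0xf0.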
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
Chain =
DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
// To match the actual intent better, we should read the output from X15 here
// again (instead of potentially spilling it to the stack), but rereading Size
// from X15 here doesn't work at -O0, since it thinks that X15 is undefined
// here.
Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
return Chain;
}
SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() &&
"Only Windows alloca probing supported");
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
assert(VT != MVT::i64 && "Expected illegal VSCALE node");
SDLoc DL(Op);
APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
DL, VT);
}
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
template <unsigned NumVecs>
static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
const CallInst &CI) {
Info.opc = ISD::INTRINSIC_VOID;
// Retrieve EC from first vector argument.
const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
ElementCount EC = VT.getVectorElementCount();
#ifndef NDEBUG
// Check the assumption that all input vectors are the same type.
for (unsigned I = 0; I < NumVecs; ++I)
assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
"Invalid type.");
#endif
// memVT is `NumVecs * VT`.
Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
EC * NumVecs);
Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
Info.offset = 0;
Info.align.reset();
Info.flags = MachineMemOperand::MOStore;
return true;
}
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_sve_st2:
return setInfoSVEStN<2>(Info, I);
case Intrinsic::aarch64_sve_st3:
return setInfoSVEStN<3>(Info, I);
case Intrinsic::aarch64_sve_st4:
return setInfoSVEStN<4>(Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align.reset();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane: {
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align.reset();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_sve_ldnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad;
if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore;
if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
default:
break;
}
return false;
}
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
// TODO: This may be worth removing. Check regression tests for diffs.
if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
return false;
// If we're reducing the load width in order to avoid having to use an extra
// instruction to do extension then it's probably a good idea.
if (ExtTy != ISD::NON_EXTLOAD)
return true;
// Don't reduce load width if it would prevent us from combining a shift into
// the offset.
MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
assert(Mem);
const SDValue &Base = Mem->getBasePtr();
if (Base.getOpcode() == ISD::ADD &&
Base.getOperand(1).getOpcode() == ISD::SHL &&
Base.getOperand(1).hasOneUse() &&
Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
// The shift can be combined if it matches the size of the value being
// loaded (and so reducing the width would make it not match).
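// For example, an i64 load from (add x1, (shl x2, 3)) can be selected as
// ldr x0, [x1, x2, lsl #3]; narrowing it to a 32-bit load would leave the
// shift amount (3) out of step with the 4-byte access and force a separate
// address computation.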
uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
if (ShiftAmount == Log2_32(LoadBytes))
return false;
}
// We have no reason to disallow reducing the load width, so allow it.
return true;
}
// Truncating from a 64-bit GPR to a 32-bit GPR is free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
/// Check if it is profitable to hoist an instruction in then/else to if.
/// Not profitable if I and its user can form an FMA instruction,
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
if (User &&
!(User->getOpcode() == Instruction::FSub ||
User->getOpcode() == Instruction::FAdd))
return true;
const TargetOptions &Options = getTargetMachine().Options;
const Function *F = I->getFunction();
const DataLayout &DL = F->getParent()->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
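// For example, "add w0, w1, w2" leaves bits [63:32] of x0 as zero, so a
// subsequent zext from i32 to i64 needs no extra instruction.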
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2)) {
return true;
}
if (Val.getOpcode() != ISD::LOAD)
return false;
// 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
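// For example, "ldrb w0, [x1]" already zeroes bits [63:8] of x0, so a zext
// of the loaded i8 to i64 is free.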
return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
VT1.getSizeInBits() <= 32);
}
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
if (isa<FPExtInst>(Ext))
return false;
// Vector types are not free.
if (Ext->getType()->isVectorTy())
return false;
for (const Use &U : Ext->uses()) {
// The extension is free if we can fold it with a left shift in an
// addressing mode or an arithmetic operation: add, sub, and cmp.
// Is there a shift?
const Instruction *Instr = cast<Instruction>(U.getUser());
// Is this a constant shift?
switch (Instr->getOpcode()) {
case Instruction::Shl:
if (!isa<ConstantInt>(Instr->getOperand(1)))
return false;
break;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
std::advance(GTI, U.getOperandNo()-1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
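// For example, indexing an array of i64 gives ShiftAmt = log2(8) = 3, which
// fits the "reg, sxtw #3" form of the addressing mode.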
uint64_t ShiftAmt =
countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
return false;
break;
}
case Instruction::Trunc:
// Check if this is a noop.
// trunc(sext ty1 to ty2) to ty1.
if (Instr->getType() == Ext->getOperand(0)->getType())
continue;
LLVM_FALLTHROUGH;
default:
return false;
}
// At this point we can use the bfm family, so this extension is free
// for that use.
}
return true;
}
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
auto *FullTy = FullV->getType();
auto *HalfTy = HalfV->getType();
return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
};
auto extractHalf = [](Value *FullV, Value *HalfV) {
auto *FullVT = cast<FixedVectorType>(FullV->getType());
auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
};
ArrayRef<int> M1, M2;
Value *S1Op1, *S2Op1;
if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
!match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
return false;
// Check that the operands are half as wide as the result and we extract
// half of the elements of the input vectors.
if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
!extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
return false;
// Check the mask extracts either the lower or upper half of vector
// elements.
int M1Start = -1;
int M2Start = -1;
int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
return false;
return true;
}
/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
auto areExtDoubled = [](Instruction *Ext) {
return Ext->getType()->getScalarSizeInBits() ==
2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
};
if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
!match(Ext2, m_ZExtOrSExt(m_Value())) ||
!areExtDoubled(cast<Instruction>(Ext1)) ||
!areExtDoubled(cast<Instruction>(Ext2)))
return false;
return true;
}
/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
Value *VectorOperand = nullptr;
ConstantInt *ElementIndex = nullptr;
return match(Op, m_ExtractElt(m_Value(VectorOperand),
m_ConstantInt(ElementIndex))) &&
ElementIndex->getValue() == 1 &&
isa<FixedVectorType>(VectorOperand->getType()) &&
cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
}
/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TargetLowering::shouldSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
if (!I->getType()->isVectorTy())
return false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::aarch64_neon_umull:
if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
return false;
Ops.push_back(&II->getOperandUse(0));
Ops.push_back(&II->getOperandUse(1));
return true;
case Intrinsic::aarch64_neon_pmull64:
if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
II->getArgOperand(1)))
return false;
Ops.push_back(&II->getArgOperandUse(0));
Ops.push_back(&II->getArgOperandUse(1));
return true;
default:
return false;
}
}
switch (I->getOpcode()) {
case Instruction::Sub:
case Instruction::Add: {
if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
return false;
// If the exts' operands extract either the lower or upper elements, we
// can sink them too.
auto Ext1 = cast<Instruction>(I->getOperand(0));
auto Ext2 = cast<Instruction>(I->getOperand(1));
if (areExtractShuffleVectors(Ext1, Ext2)) {
Ops.push_back(&Ext1->getOperandUse(0));
Ops.push_back(&Ext2->getOperandUse(0));
}
Ops.push_back(&I->getOperandUse(0));
Ops.push_back(&I->getOperandUse(1));
return true;
}
default:
return false;
}
return false;
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
Align &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
RequiredAligment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const {
return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}
MachineMemOperand::Flags
AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
return MachineMemOperand::MONone;
}
bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
return false;
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
return VecSize == 64 || VecSize % 128 == 0;
}
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
auto *FVTy = cast<FixedVectorType>(VTy);
// A pointer vector cannot be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = FVTy->getElementType();
if (EltTy->isPointerTy())
FVTy =
FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
FVTy = FixedVectorType::get(FVTy->getElementType(),
FVTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr,
FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {FVTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
FVTy->getNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
// Extract and store the sub-vectors returned by the load intrinsic.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SVI = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
FVTy->getNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
// Replace uses of the shufflevector instructions with the sub-vectors
// returned by the load intrinsic. If a shufflevector instruction is
// associated with more than one sub-vector, those sub-vectors will be
// concatenated into a single wide vector.
for (ShuffleVectorInst *SVI : Shuffles) {
auto &SubVec = SubVecs[SVI];
auto *WideVec =
SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
SVI->replaceAllUsesWith(WideVec);
}
return true;
}
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
unsigned LaneLen = VecTy->getNumElements() / Factor;
Type *EltTy = VecTy->getElementType();
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
// StN intrinsics don't support pointer vectors as arguments. Convert pointer
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
unsigned NumOpElts =
cast<FixedVectorType>(Op0->getType())->getNumElements();
// Convert to the corresponding integer vector.
auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
Value *BaseAddr = SI->getPointerOperand();
if (NumStores > 1) {
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr,
SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
auto Mask = SVI->getShuffleMask();
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
Intrinsic::aarch64_neon_st3,
Intrinsic::aarch64_neon_st4};
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
SmallVector<Value *, 5> Ops;
// Split the shufflevector operands into sub vectors for the new stN call.
for (unsigned i = 0; i < Factor; i++) {
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
unsigned IdxJ = StoreCount * LaneLen * Factor + j;
if (Mask[IdxJ * Factor + IdxI] >= 0) {
StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
break;
}
}
// Note: Filling undef gaps with random elements is ok, since
// those elements were being written anyway (with undefs).
// In the case of all undefs we default to using elements from 0.
// Note: StartMask cannot be negative; it is checked in
// isReInterleaveMask.
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
// If we're generating more than one store, compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
Builder.CreateCall(StNFunc, Ops);
}
return true;
}
// Lower an SVE structured load intrinsic returning a tuple type to a
// target-specific intrinsic taking the same input but returning a multi-result
// value of the split tuple type.
//
// E.g. Lowering an LD3:
//
// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
// <vscale x 4 x i1> %pred,
// <vscale x 4 x i32>* %addr)
//
// Output DAG:
//
// t0: ch = EntryToken
// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
// t4: i64,ch = CopyFromReg t0, Register:i64 %1
// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
//
// This is called pre-legalization to avoid widening/splitting issues with
// non-power-of-2 tuple types used for LD3, such as nxv12i32.
SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
ArrayRef<SDValue> LoadOps,
EVT VT, SelectionDAG &DAG,
const SDLoc &DL) const {
assert(VT.isScalableVector() && "Can only lower scalable vectors");
unsigned N, Opcode;
static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
{Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
{Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
{Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
assert(VT.getVectorElementCount().Min % N == 0 &&
"invalid tuple vector type!");
EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
VT.getVectorElementCount() / N);
assert(isTypeLegal(SplitVT));
SmallVector<EVT, 5> VTs(N, SplitVT);
VTs.push_back(MVT::Other); // Chain
SDVTList NodeTys = DAG.getVTList(VTs);
SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
SmallVector<SDValue, 4> PseudoLoadOps;
for (unsigned I = 0; I < N; ++I)
PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
}
EVT AArch64TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
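// For example (illustrative only): a 64-byte, 16-byte-aligned memset can use
// four 16-byte NEON stores of a v2i64, while a 16-byte memset is left to
// plain i64 stores.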
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
&Fast) &&
Fast;
};
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return MVT::v2i64;
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return MVT::i32;
return MVT::Other;
}
LLT AArch64TargetLowering::getOptimalMemOpLLT(
const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
&Fast) &&
Fast;
};
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return LLT::vector(2, 64);
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return LLT::scalar(32);
return LLT();
}
// 12-bit optionally shifted immediates are legal for adds.
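// For example, 4095 (0xfff) and 0x123000 (0x123 << 12) are legal add
// immediates, while 0x1001 << 12 is not and must be materialized separately.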
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
<< ": avoid UB for INT64_MIN\n");
return false;
}
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
bool IsLegal = ((Immed >> 12) == 0 ||
((Immed & 0xfff) == 0 && Immed >> 24 == 0));
LLVM_DEBUG(dbgs() << "Is " << Immed
<< " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
return isLegalAddImmediate(Immed);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS, Instruction *I) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
// reg + SIZE_IN_BYTES * 12-bit unsigned offset
// reg1 + reg2
// reg + SIZE_IN_BYTES * reg
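// For an i64 access (SIZE_IN_BYTES == 8) these correspond to, e.g.:
// ldr x0, [x1]
// ldur x0, [x1, #-17]
// ldr x0, [x1, #4088] (8 * uimm12)
// ldr x0, [x1, x2]
// ldr x0, [x1, x2, lsl #3]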
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// No reg+reg+imm addressing.
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
// FIXME: Update this method to support scalable addressing modes.
if (isa<ScalableVectorType>(Ty))
return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
if (Ty->isSized()) {
uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
}
if (!AM.Scale) {
int64_t Offset = AM.BaseOffs;
// 9-bit signed offset
if (isInt<9>(Offset))
return true;
// 12-bit unsigned offset
unsigned shift = Log2_64(NumBytes);
if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
// Must be a multiple of NumBytes (NumBytes is a power of 2)
(Offset >> shift) << shift == Offset)
return true;
return false;
}
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
// Consider splitting large offset of struct or array.
return true;
}
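// For example, [Xn, Xm] (Scale == 1) is treated as free, while a scaled form
// such as [Xn, Xm, lsl #2] costs 1; illegal addressing modes return -1.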
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// Rt, [Xn, Xm] | 4
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
return -1;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
const MachineFunction &MF, EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
return true;
default:
return false;
}
}
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
static const MCPhysReg ScratchRegs[] = {
AArch64::X16, AArch64::X17, AArch64::LR, 0
};
return ScratchRegs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
N = N->getOperand(0).getNode();
EVT VT = N->getValueType(0);
// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
// it with shift to let it be lowered to UBFX.
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
isa<ConstantSDNode>(N->getOperand(1))) {
uint64_t TruncMask = N->getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
N->getOperand(0).getOpcode() == ISD::SRL &&
isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
return false;
}
return true;
}
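// Returns true when a constant is cheap enough to materialize directly rather
// than load from a constant pool: zero and logical immediates are always
// accepted, and otherwise (after inverting negative values) all set bits must
// lie below bit 48, i.e. at most a MOVZ plus two MOVKs.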
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return false;
int64_t Val = Imm.getSExtValue();
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
return true;
if ((int64_t)Val < 0)
Val = ~Val;
if (BitSize == 32)
Val &= (1LL << 32) - 1;
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
return Shift < 3;
}
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// cmge X, X, #0
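/// For example, (xor (AArch64ISD::VASHR v4i32 X, #31), splat(-1)) becomes
/// (AArch64ISD::CMGEz v4i32 X).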
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!Subtarget->hasNEON() || !VT.isVector())
return SDValue();
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CSEL.
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
N0.getOperand(0));
// Generate SUBS & CSEL.
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
N0.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
SDValue(Cmp.getNode(), 1));
}
return SDValue();
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
return performIntegerAbsCombine(N, DAG);
}
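// Lower (sdiv x, +/-2^k) without a real division. For example, sdiv i32 x, 8
// becomes: compare x with 0, compute x+7, csel the biased value when x is
// negative, then asr by 3; for a negative divisor the result is also negated.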
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
unsigned Lg2 = Divisor.countTrailingZeros();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
// Add (N0 < 0) ? Pow2 - 1 : 0;
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
Created.push_back(Cmp.getNode());
Created.push_back(Add.getNode());
Created.push_back(CSel.getNode());
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
Created.push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
static bool IsSVECntIntrinsic(SDValue S) {
switch(getIntrinsicID(S.getNode())) {
default:
break;
case Intrinsic::aarch64_sve_cntb:
case Intrinsic::aarch64_sve_cnth:
case Intrinsic::aarch64_sve_cntw:
case Intrinsic::aarch64_sve_cntd:
return true;
}
return false;
}
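// Rewrite multiplications by constants of the form +/-(2^N +/- 1), optionally
// times 2^M, as shift+add/sub sequences. For example:
//   mul x, 3  -> add (shl x, 1), x
//   mul x, 7  -> sub (shl x, 3), x
//   mul x, 6  -> shl (add (shl x, 1), x), 1
//   mul x, -3 -> sub x, (shl x, 2)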
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
SDValue N0 = N->getOperand(0);
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
// Allow the scaling to be folded into the `cnt` instruction by preventing
// the scaling to be obscured here. This makes it easier to pattern match.
if (IsSVECntIntrinsic(N0) ||
(N0->getOpcode() == ISD::TRUNCATE &&
(IsSVECntIntrinsic(N0->getOperand(0)))))
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as a shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
// More aggressively, some multiplications N0 * C can be lowered to
// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals (1+2)*16-(1+2).
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
isZeroExtended(N0.getNode(), DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
N->use_begin()->getOpcode() == ISD::SUB))
return SDValue();
}
// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
unsigned ShiftAmt, AddSubOpc;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
bool NegateResult = false;
if (ConstValue.isNonNegative()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
APInt SCVMinus1 = ShiftedConstValue - 1;
APInt CVPlus1 = ConstValue + 1;
if (SCVMinus1.isPowerOf2()) {
ShiftAmt = SCVMinus1.logBase2();
AddSubOpc = ISD::ADD;
} else if (CVPlus1.isPowerOf2()) {
ShiftAmt = CVPlus1.logBase2();
AddSubOpc = ISD::SUB;
} else
return SDValue();
} else {
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt CVNegPlus1 = -ConstValue + 1;
APInt CVNegMinus1 = -ConstValue - 1;
if (CVNegPlus1.isPowerOf2()) {
ShiftAmt = CVNegPlus1.logBase2();
AddSubOpc = ISD::SUB;
ShiftValUseIsN0 = false;
} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;
NegateResult = true;
} else
return SDValue();
}
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");
// Negate the result.
if (NegateResult)
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
// Shift the result.
if (TrailingZeroes)
return DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getConstant(TrailingZeroes, DL, MVT::i64));
return Res;
}
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
return Res;
}
return SDValue();
}
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Only optimize when the source and destination types have the same width.
if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// If the result of an integer load is only used by an integer-to-float
// conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
// This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
unsigned Opcode =
(N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
}
return SDValue();
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
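/// For example, (fptosi (fmul v2f32 X, splat(4.0))) becomes an
/// aarch64.neon.vcvtfp2fxs call with a fixed-point shift of 2.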
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
if (!N->getValueType(0).isSimple())
return SDValue();
SDValue Op = N->getOperand(0);
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
uint32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., float -> i64).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t Bits = IntBits == 64 ? 64 : 32;
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
if (C == -1 || C == 0 || C > Bits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
"Illegal vector type after legalization");
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
: Intrinsic::aarch64_neon_vcvtfp2fxu;
SDValue FixConv =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
// We can handle smaller integers by generating an extra trunc.
if (IntBits < FloatBits)
FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
return FixConv;
}
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
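/// For example, (fdiv (sitofp v2i32 X), splat(8.0)) becomes an
/// aarch64.neon.vcvtfxs2fp call with a fixed-point shift of 3.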
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
unsigned Opc = Op->getOpcode();
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
!Op.getOperand(0).getValueType().isSimple() ||
(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
return SDValue();
SDValue ConstVec = N->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
int32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
int32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
if (C == -1 || C == 0 || C > FloatBits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc DL(N);
SDValue ConvInput = Op.getOperand(0);
bool IsSigned = Opc == ISD::SINT_TO_FP;
if (IntBits < FloatBits)
ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
ResTy, ConvInput);
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
: Intrinsic::aarch64_neon_vcvtfxu2fp;
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
bool &FromHi) {
if (N.getOpcode() == ISD::SHL)
FromHi = false;
else if (N.getOpcode() == ISD::SRL)
FromHi = true;
else
return false;
if (!isa<ConstantSDNode>(N.getOperand(1)))
return false;
ShiftAmount = N->getConstantOperandVal(1);
Src = N->getOperand(0);
return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
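/// For example, on i32: (or (shl x, #16), (srl y, #16)) becomes
/// (EXTR x, y, #16).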
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
EVT VT = N->getValueType(0);
assert(N->getOpcode() == ISD::OR && "Unexpected root");
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
SDValue LHS;
uint32_t ShiftLHS = 0;
bool LHSFromHi = false;
if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
return SDValue();
SDValue RHS;
uint32_t ShiftRHS = 0;
bool RHSFromHi = false;
if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
return SDValue();
// If they're both trying to come from the high part of the register, they're
// not really an EXTR.
if (LHSFromHi == RHSFromHi)
return SDValue();
if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
return SDValue();
if (LHSFromHi) {
std::swap(LHS, RHS);
std::swap(ShiftLHS, ShiftRHS);
}
return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
DAG.getConstant(ShiftRHS, DL, MVT::i64));
}
static SDValue tryCombineToBSL(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
if (!VT.isVector())
return SDValue();
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND)
return SDValue();
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() != ISD::AND)
return SDValue();
// We only have to look for constant vectors here since the general, variable
// case can be handled in TableGen.
unsigned Bits = VT.getScalarSizeInBits();
uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
for (int i = 1; i >= 0; --i)
for (int j = 1; j >= 0; --j) {
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
if (!BVN0 || !BVN1)
continue;
bool FoundMatch = true;
for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
if (!CN0 || !CN1 ||
CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
FoundMatch = false;
break;
}
}
if (FoundMatch)
return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
return SDValue();
}
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
if (!MemVT.getVectorElementType().isSimple())
return false;
uint64_t MaskForTy = 0ull;
switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
case MVT::i8:
MaskForTy = 0xffull;
break;
case MVT::i16:
MaskForTy = 0xffffull;
break;
case MVT::i32:
MaskForTy = 0xffffffffull;
break;
default:
return false;
break;
}
if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
return false;
}
static SDValue performSVEAndCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
// Zero/any extend of an unsigned unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
SDValue UnpkOp = Src->getOperand(0);
SDValue Dup = N->getOperand(1);
if (Dup.getOpcode() != AArch64ISD::DUP)
return SDValue();
SDLoc DL(N);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
uint64_t ExtVal = C->getZExtValue();
// If the mask is fully covered by the unpack, we don't need to push
// a new AND onto the operand
EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
(ExtVal == 0xFFFF && EltTy == MVT::i16) ||
(ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
return Src;
// Truncate to prevent a DUP with an overly wide constant
APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
// Otherwise, make sure we propagate the AND to the operand
// of the unpack
Dup = DAG.getNode(AArch64ISD::DUP, DL,
UnpkOp->getValueType(0),
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
SDValue And = DAG.getNode(ISD::AND, DL,
UnpkOp->getValueType(0), UnpkOp, Dup);
return DAG.getNode(Opc, DL, N->getValueType(0), And);
}
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
return SDValue();
EVT MemVT;
// SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
switch (Opc) {
case AArch64ISD::LD1_MERGE_ZERO:
case AArch64ISD::LDNF1_MERGE_ZERO:
case AArch64ISD::LDFF1_MERGE_ZERO:
MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
break;
case AArch64ISD::GLD1_MERGE_ZERO:
case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
case AArch64ISD::GLD1_IMM_MERGE_ZERO:
case AArch64ISD::GLDFF1_MERGE_ZERO:
case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
case AArch64ISD::GLDNT1_MERGE_ZERO:
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
return SDValue();
}
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
return SDValue();
}
static SDValue performANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);
EVT VT = N->getValueType(0);
if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
if (VT.isScalableVector())
return performSVEAndCombine(N, DCI);
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
if (!BVN)
return SDValue();
// AND does not accept an immediate, so check if we can use a BIC immediate
// instruction instead. We do this here instead of using a (and x, (mvni imm))
// pattern in isel, because some immediates may be lowered to the preferred
// (and x, (movi imm)) form, even though an mvni representation also exists.
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
DefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
DefBits, &LHS)))
return NewOp;
UndefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
UndefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
UndefBits, &LHS)))
return NewOp;
}
return SDValue();
}
static SDValue performSRLCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
// high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
// to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() == ISD::BSWAP) {
SDLoc DL(N);
SDValue N1 = N->getOperand(1);
SDValue N00 = N0.getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
uint64_t ShiftAmt = C->getZExtValue();
if (VT == MVT::i32 && ShiftAmt == 16 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
if (VT == MVT::i64 && ShiftAmt == 32 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
}
}
return SDValue();
}
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
// (v2i16 (truncate (v2i64)))))
// ->
// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
// (v4i32 (bitcast (v2i64))),
// <0, 2, 4, 6>)))
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
N1Opc == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
if (N00VT == N10.getValueType() &&
(N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
for (size_t i = 0; i < Mask.size(); ++i)
Mask[i] = i * 2;
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getVectorShuffle(
MidVT, dl,
DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
}
}
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Optimise concat_vectors of two [us]rhadds that use extracted subvectors
// from the same original vectors. Combine these into a single [us]rhadd that
// operates on the two original vectors. Example:
// (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
// extract_subvector (v16i8 OpB,
// <0>))),
// (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
// extract_subvector (v16i8 OpB,
// <8>)))))
// ->
// (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
(N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
SDValue N10 = N1->getOperand(0);
SDValue N11 = N1->getOperand(1);
EVT N00VT = N00.getValueType();
EVT N10VT = N10.getValueType();
if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
SDValue N00Source = N00->getOperand(0);
SDValue N01Source = N01->getOperand(0);
SDValue N10Source = N10->getOperand(0);
SDValue N11Source = N11->getOperand(0);
if (N00Source == N10Source && N01Source == N11Source &&
N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
assert(N0.getValueType() == N1.getValueType());
uint64_t N00Index = N00.getConstantOperandVal(1);
uint64_t N01Index = N01.getConstantOperandVal(1);
uint64_t N10Index = N10.getConstantOperandVal(1);
uint64_t N11Index = N11.getConstantOperandVal(1);
if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
N10Index == N00VT.getVectorNumElements())
return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
}
}
}
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
}
// Canonicalise concat_vectors so that the right-hand vector has as few
// bit-casts as possible before its real operation. The primary matching
// destination for these operations will be the narrowing "2" instructions,
// which depend on the operation being performed on this right-hand vector.
// For example,
// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
if (N1Opc != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())
return SDValue();
LLVM_DEBUG(
dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
RHS));
}
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait until after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Transform a scalar conversion of a value from a lane extract into a
// lane extract of a vector conversion. E.g., from foo1 to foo2:
// double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
// double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
//
// The second form interacts better with instruction selection and the
// register allocator to avoid cross-class register copies that aren't
// coalescable due to a lane reference.
// Check the operand and see if it originates from a lane extract.
SDValue Op1 = N->getOperand(1);
if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Yep, no additional predication needed. Perform the transform.
SDValue IID = N->getOperand(0);
SDValue Shift = N->getOperand(2);
SDValue Vec = Op1.getOperand(0);
SDValue Lane = Op1.getOperand(1);
EVT ResTy = N->getValueType(0);
EVT VecResTy;
SDLoc DL(N);
// The vector width should be 128 bits by the time we get here, even
// if it started as 64 bits (the extract_vector handling will have
// done so).
assert(Vec.getValueSizeInBits() == 128 &&
"unexpected vector size on extract_vector_elt!");
if (Vec.getValueType() == MVT::v4i32)
VecResTy = MVT::v4f32;
else if (Vec.getValueType() == MVT::v2i64)
VecResTy = MVT::v2f64;
else
llvm_unreachable("unexpected vector type!");
SDValue Convert =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}
return SDValue();
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
// (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
switch (N.getOpcode()) {
case AArch64ISD::DUP:
case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:
case AArch64ISD::MOVI:
case AArch64ISD::MOVIshift:
case AArch64ISD::MOVIedit:
case AArch64ISD::MOVImsl:
case AArch64ISD::MVNIshift:
case AArch64ISD::MVNImsl:
break;
default:
// FMOV could be supported, but isn't very useful, as it would only occur
// if you passed a bitcast'd floating-point immediate to an eligible long
// integer op (addl, smull, ...).
return SDValue();
}
MVT NarrowTy = N.getSimpleValueType();
if (!NarrowTy.is64BitVector())
return SDValue();
MVT ElementTy = NarrowTy.getVectorElementType();
unsigned NumElems = NarrowTy.getVectorNumElements();
MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}
static bool isEssentiallyExtractHighSubvector(SDValue N) {
if (N.getOpcode() == ISD::BITCAST)
N = N.getOperand(0);
if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
const SDValue *Opnd0;
const SDValue *Opnd1;
ISD::CondCode CC;
};
/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
const SDValue *Cmp;
AArch64CC::CondCode CC;
};
/// Helper structure to keep track of SetCC information.
union SetCCInfo {
GenericSetCCInfo Generic;
AArch64SetCCInfo AArch64;
};
/// Helper structure to be able to read SetCC information. If set to
/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
SetCCInfo Info;
bool IsAArch64;
};
/// Check whether or not \p Op is a SET_CC operation, either a generic one or
/// an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
// If this is a setcc, this is straightforward.
if (Op.getOpcode() == ISD::SETCC) {
SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SetCCInfo.IsAArch64 = false;
return true;
}
// Otherwise, check if this is a matching csel instruction.
// In other words:
// - csel 1, 0, cc
// - csel 0, 1, !cc
if (Op.getOpcode() != AArch64ISD::CSEL)
return false;
// Set the information about the operands.
// TODO: we want the operands of the Cmp not the csel
SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
SetCCInfo.IsAArch64 = true;
SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// Check that the operands matches the constraints:
// (1) Both operands must be constants.
// (2) One must be 1 and the other must be 0.
ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
// Check (1).
if (!TValue || !FValue)
return false;
// Check (2).
if (!TValue->isOne()) {
// Update the comparison when we are interested in !cc.
std::swap(TValue, FValue);
SetCCInfo.Info.AArch64.CC =
AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
}
return TValue->isOne() && FValue->isNullValue();
}
// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
if (isSetCC(Op, Info))
return true;
return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
// -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
SDValue LHS = Op->getOperand(0);
SDValue RHS = Op->getOperand(1);
SetCCInfoAndKind InfoAndKind;
// If neither operand is a SET_CC, give up.
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
std::swap(LHS, RHS);
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
return SDValue();
}
// FIXME: This could be generalized to work for FP comparisons.
EVT CmpVT = InfoAndKind.IsAArch64
? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
: InfoAndKind.Info.Generic.Opnd0->getValueType();
if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
return SDValue();
SDValue CCVal;
SDValue Cmp;
SDLoc dl(Op);
if (InfoAndKind.IsAArch64) {
CCVal = DAG.getConstant(
AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
MVT::i32);
Cmp = *InfoAndKind.Info.AArch64.Cmp;
} else
Cmp = getAArch64Cmp(
*InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
dl);
EVT VT = Op->getValueType(0);
LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
// (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector()) {
if (N->getOpcode() == ISD::ADD)
return performSetccAddFolding(N, DAG);
return SDValue();
}
// Make sure both branches are extended in the same way.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
LHS.getOpcode() != ISD::SIGN_EXTEND) ||
LHS.getOpcode() != RHS.getOpcode())
return SDValue();
unsigned ExtType = LHS.getOpcode();
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
} else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
}
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
// (aarch64_neon_umull (extract_high (v2i64 vec)))
// (extract_high (v2i64 (dup128 scalar)))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
assert(LHS.getValueType().is64BitVector() &&
RHS.getValueType().is64BitVector() &&
"unexpected shape for long operation");
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
} else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
}
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
N->getOperand(0), LHS, RHS);
}
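// Fold NEON shift intrinsics with a constant (splat) shift amount into the
// corresponding immediate-shift nodes. For example,
// aarch64.neon.srshl(v, splat(-3)) becomes (SRSHR_I v, #3) and
// aarch64.neon.sshl(v, splat(2)) becomes (VSHL v, #2).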
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
MVT ElemTy = N->getSimpleValueType(0).getScalarType();
unsigned ElemBits = ElemTy.getSizeInBits();
int64_t ShiftAmount;
if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, ElemBits) ||
SplatBitSize != ElemBits)
return SDValue();
ShiftAmount = SplatValue.getSExtValue();
} else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
ShiftAmount = CVN->getSExtValue();
} else
return SDValue();
unsigned Opcode;
bool IsRightShift;
switch (IID) {
default:
llvm_unreachable("Unknown shift intrinsic");
case Intrinsic::aarch64_neon_sqshl:
Opcode = AArch64ISD::SQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_uqshl:
Opcode = AArch64ISD::UQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_srshl:
Opcode = AArch64ISD::SRSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_urshl:
Opcode = AArch64ISD::URSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_sqshlu:
Opcode = AArch64ISD::SQSHLU_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_sshl:
case Intrinsic::aarch64_neon_ushl:
// For positive shift amounts we can use SHL, as ushl/sshl perform a regular
// left shift for positive shift amounts. Below, we only replace the current
// node with VSHL if this condition is met.
Opcode = AArch64ISD::VSHL;
IsRightShift = false;
break;
}
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(-ShiftAmount, dl, MVT::i32));
} else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(ShiftAmount, dl, MVT::i32));
}
return SDValue();
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, there's almost
// certainly going to be a zext in the DAG which we can eliminate.
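// For example, crc32b(crc, (and x, 0xff)) becomes crc32b(crc, x).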
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
SDValue AndN = N->getOperand(2);
if (AndN.getOpcode() != ISD::AND)
return SDValue();
ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
if (!CMask || CMask->getZExtValue() != Mask)
return SDValue();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
SelectionDAG &DAG) {
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
DAG.getNode(Opc, dl,
N->getOperand(1).getSimpleValueType(),
N->getOperand(1)),
DAG.getConstant(0, dl, MVT::i64));
}
static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
SelectionDAG &DAG) {
SDLoc dl(N);
LLVMContext &Ctx = *DAG.getContext();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
SDValue Pred = N->getOperand(1);
SDValue Data = N->getOperand(2);
EVT DataVT = Data.getValueType();
if (DataVT.getVectorElementType().isScalarInteger() &&
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) {
if (!TLI.isTypeLegal(DataVT))
return SDValue();
EVT OutputVT = EVT::getVectorVT(Ctx, VT,
AArch64::NeonBitsPerVector / VT.getSizeInBits());
SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data);
SDValue Zero = DAG.getConstant(0, dl, MVT::i64);
SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero);
return Result;
}
return SDValue();
}
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op1 = N->getOperand(1);
SDValue Op2 = N->getOperand(2);
EVT ScalarTy = Op1.getValueType();
if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
}
return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
Op1, Op2);
}
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
SDLoc dl(N);
SDValue Scalar = N->getOperand(3);
EVT ScalarTy = Scalar.getValueType();
if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
SDValue Passthru = N->getOperand(1);
SDValue Pred = N->getOperand(2);
return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
Pred, Scalar, Passthru);
}
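// Lower aarch64.sve.ext by working in the byte domain: for example, for
// nxv4i32 operands the lane index is multiplied by 4 and the operands are
// bitcast to nxv16i8 before building the EXT node, then the result is bitcast
// back to the original type.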
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
SDLoc dl(N);
LLVMContext &Ctx = *DAG.getContext();
EVT VT = N->getValueType(0);
assert(VT.isScalableVector() && "Expected a scalable vector.");
// Current lowering only supports the SVE-ACLE types.
if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
return SDValue();
unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });
// Convert everything to the domain of EXT (i.e bytes).
SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
DAG.getConstant(ElemSize, dl, MVT::i32));
SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize())
return SDValue();
SDValue Comparator = N->getOperand(3);
if (Comparator.getOpcode() == AArch64ISD::DUP ||
Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
unsigned IID = getIntrinsicID(N);
EVT VT = N->getValueType(0);
EVT CmpVT = N->getOperand(2).getValueType();
SDValue Pred = N->getOperand(1);
SDValue Imm;
SDLoc DL(N);
switch (IID) {
default:
llvm_unreachable("Called with wrong intrinsic!");
break;
// Signed comparisons
case Intrinsic::aarch64_sve_cmpeq_wide:
case Intrinsic::aarch64_sve_cmpne_wide:
case Intrinsic::aarch64_sve_cmpge_wide:
case Intrinsic::aarch64_sve_cmpgt_wide:
case Intrinsic::aarch64_sve_cmplt_wide:
case Intrinsic::aarch64_sve_cmple_wide: {
if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
int64_t ImmVal = CN->getSExtValue();
if (ImmVal >= -16 && ImmVal <= 15)
Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
else
return SDValue();
}
break;
}
// Unsigned comparisons
case Intrinsic::aarch64_sve_cmphs_wide:
case Intrinsic::aarch64_sve_cmphi_wide:
case Intrinsic::aarch64_sve_cmplo_wide:
case Intrinsic::aarch64_sve_cmpls_wide: {
if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
uint64_t ImmVal = CN->getZExtValue();
if (ImmVal <= 127)
Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
else
return SDValue();
}
break;
}
}
if (!Imm)
return SDValue();
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
N->getOperand(2), Splat, DAG.getCondCode(CC));
}
return SDValue();
}
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
AArch64CC::CondCode Cond) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(Op);
assert(Op.getValueType().isScalableVector() &&
TLI.isTypeLegal(Op.getValueType()) &&
"Expected legal scalable vector type!");
// Ensure target specific opcodes are using legal type.
EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue TVal = DAG.getConstant(1, DL, OutVT);
SDValue FVal = DAG.getConstant(0, DL, OutVT);
// Set condition code (CC) flags.
SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
// Convert CC to integer based on requested condition.
// NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
return DAG.getZExtOrTrunc(Res, DL, VT);
}
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Pred = N->getOperand(1);
SDValue VecToReduce = N->getOperand(2);
EVT ReduceVT = VecToReduce.getValueType();
SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
// SVE reductions set the whole vector register with the first element
// containing the reduction result, which we'll now extract.
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
Zero);
}
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Pred = N->getOperand(1);
SDValue InitVal = N->getOperand(2);
SDValue VecToReduce = N->getOperand(3);
EVT ReduceVT = VecToReduce.getValueType();
// Ordered reductions use the first lane of the result vector as the
// reduction's initial value.
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
DAG.getUNDEF(ReduceVT), InitVal, Zero);
SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
// SVE reductions set the whole vector register with the first element
// containing the reduction result, which we'll now extract.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
Zero);
}
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);
switch (IID) {
default:
break;
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
case Intrinsic::aarch64_neon_sminv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
case Intrinsic::aarch64_neon_uminv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
case Intrinsic::aarch64_neon_smaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmaxnm:
return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fminnm:
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
case Intrinsic::aarch64_neon_sshl:
case Intrinsic::aarch64_neon_ushl:
return tryCombineShiftImm(IID, N, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
case Intrinsic::aarch64_sve_smaxv:
return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_umaxv:
return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_sminv:
return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
case Intrinsic::aarch64_sve_uminv:
return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
case Intrinsic::aarch64_sve_orv:
return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
case Intrinsic::aarch64_sve_eorv:
return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
case Intrinsic::aarch64_sve_index:
return LowerSVEIntrinsicIndex(N, DAG);
case Intrinsic::aarch64_sve_dup:
return LowerSVEIntrinsicDUP(N, DAG);
case Intrinsic::aarch64_sve_dup_x:
return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
case Intrinsic::aarch64_sve_smin:
return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_umin:
return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_smax:
return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_umax:
return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_lsl:
return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_lsr:
return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_asr:
return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
break;
case Intrinsic::aarch64_sve_cmphi:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
break;
case Intrinsic::aarch64_sve_cmpge:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETGE));
break;
case Intrinsic::aarch64_sve_cmpgt:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETGT));
break;
case Intrinsic::aarch64_sve_cmpeq:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
break;
case Intrinsic::aarch64_sve_cmpne:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETNE));
break;
case Intrinsic::aarch64_sve_fadda:
return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
case Intrinsic::aarch64_sve_faddv:
return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
case Intrinsic::aarch64_sve_fmaxnmv:
return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
case Intrinsic::aarch64_sve_fmaxv:
return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_fminnmv:
return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
case Intrinsic::aarch64_sve_fminv:
return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
case Intrinsic::aarch64_sve_sel:
return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmpeq_wide:
return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
case Intrinsic::aarch64_sve_cmpne_wide:
return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpge_wide:
return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpgt_wide:
return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplt_wide:
return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
case Intrinsic::aarch64_sve_cmple_wide:
return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphs_wide:
return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphi_wide:
return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplo_wide:
return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
case Intrinsic::aarch64_sve_cmpls_wide:
return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
case Intrinsic::aarch64_sve_ptest_any:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::ANY_ACTIVE);
case Intrinsic::aarch64_sve_ptest_first:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::FIRST_ACTIVE);
case Intrinsic::aarch64_sve_ptest_last:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::LAST_ACTIVE);
}
return SDValue();
}
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
// we can convert that DUP into another extract_high (of a bigger DUP), which
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
SDNode *ABDNode = N->getOperand(0).getNode();
unsigned IID = getIntrinsicID(ABDNode);
if (IID == Intrinsic::aarch64_neon_sabd ||
IID == Intrinsic::aarch64_neon_uabd) {
SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
if (!NewABD.getNode())
return SDValue();
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
NewABD);
}
}
// This is effectively a custom type legalization for AArch64.
//
// Type legalization will split an extend of a small, legal, type to a larger
// illegal type by first splitting the destination type, often creating
// illegal source types, which then get legalized in isel-confusing ways,
// leading to really terrible codegen. E.g.,
// %result = v8i32 sext v8i8 %value
// becomes
// %losrc = extract_subreg %value, ...
// %hisrc = extract_subreg %value, ...
// %lo = v4i32 sext v4i8 %losrc
// %hi = v4i32 sext v4i8 %hisrc
// Things go rapidly downhill from there.
//
// For AArch64, the [sz]ext vector instructions can only go up one element
// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
// takes two instructions.
//
// This implies that the most efficient way to do the extend from v8i8
// to two v4i32 values is to first extend the v8i8 to v8i16, then let
// the normal splitting happen for the v8i16->v8i32.
// This is pre-legalization to catch some cases where the default
// type legalization will create ill-tempered code.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// We're only interested in cleaning things up for non-legal vector types
// here. If both the source and destination are legal, things will just
// work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ResVT = N->getValueType(0);
if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
return SDValue();
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
SDValue Src = N->getOperand(0);
EVT SrcVT = Src->getValueType(0);
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
// If the source VT is a 64-bit fixed or scalable vector, we can play games
// and get the better results we want.
if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
ElementCount SrcEC = SrcVT.getVectorElementCount();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
// Now split the rest of the operation into two halves, each with a 64
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorElementCount());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
// Now combine the parts back together so we still have a single result
// like the combiner expects.
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
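// Illustrative sketch (not from the original source): for a v8i8 -> v8i32
// sign extend, the combine above first widens the source to v8i16 and then
// splits on legal halves:
//   t0  = v8i16 sext v8i8 %value
//   lo  = v4i32 sext (v4i16 extract_subvector t0, 0)
//   hi  = v4i32 sext (v4i16 extract_subvector t0, 4)
//   res = v8i32 concat_vectors lo, hi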
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) {
assert(!St.isTruncatingStore() && "cannot split truncating vector store");
unsigned OrigAlignment = St.getAlignment();
unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
// Create scalar stores. This is at least as good as the code sequence for a
// split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
uint64_t BaseOffset = 0;
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
// As this is ISel, we will not merge this add, which may degrade results.
if (BasePtr->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(BasePtr->getOperand(1))) {
BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
BasePtr = BasePtr->getOperand(0);
}
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
SDValue OffsetPtr =
DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
}
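// Illustrative example (not from the original source): assuming the splat
// value lives in w8, a v4i32 splat store becomes four scalar stores at byte
// offsets 0, 4, 8 and 12,
//   str w8, [x0]
//   str w8, [x0, #4]
//   str w8, [x0, #8]
//   str w8, [x0, #12]
// which the load/store optimizer typically merges into two stp instructions.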
// Returns an SVE type that ContentTy can be trivially sign or zero extended
// into.
static MVT getSVEContainerType(EVT ContentTy) {
assert(ContentTy.isSimple() && "No SVE containers for extended types");
switch (ContentTy.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("No known SVE container for this MVT type");
case MVT::nxv2i8:
case MVT::nxv2i16:
case MVT::nxv2i32:
case MVT::nxv2i64:
case MVT::nxv2f32:
case MVT::nxv2f64:
return MVT::nxv2i64;
case MVT::nxv4i8:
case MVT::nxv4i16:
case MVT::nxv4i32:
case MVT::nxv4f32:
return MVT::nxv4i32;
case MVT::nxv8i8:
case MVT::nxv8i16:
case MVT::nxv8f16:
case MVT::nxv8bf16:
return MVT::nxv8i16;
case MVT::nxv16i8:
return MVT::nxv16i8;
}
}
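// For example (illustrative): an nxv2i16 value is loaded or stored via its
// nxv2i64 container and truncated or extended on either side; see
// performLD1Combine and performST1Combine below.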
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
EVT ContainerVT = VT;
if (ContainerVT.isInteger())
ContainerVT = getSVEContainerType(ContainerVT);
SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
SDValue Ops[] = { N->getOperand(0), // Chain
N->getOperand(2), // Pg
N->getOperand(3), // Base
DAG.getValueType(VT) };
SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);
if (ContainerVT.isInteger() && (VT != ContainerVT))
Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
return DAG.getMergeValues({ Load, LoadChain }, DL);
}
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
if (VT == MVT::nxv8bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
auto *MINode = cast<MemIntrinsicSDNode>(N);
SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
MINode->getOperand(3), DAG.getUNDEF(PtrTy),
MINode->getOperand(2), PassThru,
MINode->getMemoryVT(), MINode->getMemOperand(),
ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
if (VT.isFloatingPoint()) {
SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
return DAG.getMergeValues(Ops, DL);
}
return L;
}
template <unsigned Opcode>
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
"Unsupported opcode.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ if (VT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);
if (VT.isFloatingPoint())
Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
return DAG.getMergeValues({Load, LoadChain}, DL);
}
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Data = N->getOperand(2);
EVT DataVT = Data.getValueType();
EVT HwSrcVt = getSVEContainerType(DataVT);
SDValue InputVT = DAG.getValueType(DataVT);
if (DataVT == MVT::nxv8bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
if (DataVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
SDValue SrcNew;
if (Data.getValueType().isFloatingPoint())
SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
else
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
SDValue Ops[] = { N->getOperand(0), // Chain
SrcNew,
N->getOperand(4), // Base
N->getOperand(3), // Pg
InputVT
};
return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Data = N->getOperand(2);
EVT DataVT = Data.getValueType();
EVT PtrTy = N->getOperand(4).getValueType();
if (DataVT == MVT::nxv8bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
if (DataVT.isFloatingPoint())
Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
auto *MINode = cast<MemIntrinsicSDNode>(N);
return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
DAG.getUNDEF(PtrTy), MINode->getOperand(3),
MINode->getMemoryVT(), MINode->getMemOperand(),
ISD::UNINDEXED, false, false);
}
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// Avoid scalarizing zero splat stores for scalable vectors.
if (VT.isScalableVector())
return SDValue();
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
if (!(((NumVecElts == 2 || NumVecElts == 3) &&
VT.getVectorElementType().getSizeInBits() == 64) ||
((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
VT.getVectorElementType().getSizeInBits() == 32)))
return SDValue();
if (StVal.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// If the zero constant has more than one use then the vector store could be
// better since the constant mov will be amortized and stp q instructions
// should be able to be formed.
if (!StVal.hasOneUse())
return SDValue();
// If the store is truncating then it's going down to i16 or smaller, which
// means it can be implemented in a single store anyway.
if (St.isTruncatingStore())
return SDValue();
// If the immediate offset of the address operand is too large for the stp
// instruction, then bail out.
if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
if (Offset < -512 || Offset > 504)
return SDValue();
}
for (int I = 0; I < NumVecElts; ++I) {
SDValue EltVal = StVal.getOperand(I);
if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
return SDValue();
}
// Use a CopyFromReg WZR/XZR here to prevent
// DAGCombiner::MergeConsecutiveStores from undoing this transformation.
SDLoc DL(&St);
unsigned ZeroReg;
EVT ZeroVT;
if (VT.getVectorElementType().getSizeInBits() == 32) {
ZeroReg = AArch64::WZR;
ZeroVT = MVT::i32;
} else {
ZeroReg = AArch64::XZR;
ZeroVT = MVT::i64;
}
SDValue SplatVal =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// Don't replace floating point stores, they possibly won't be transformed to
// stp because of the store pair suppress pass.
if (VT.isFloatingPoint())
return SDValue();
// We can express a splat as store pair(s) for 2 or 4 elements.
unsigned NumVecElts = VT.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 2)
return SDValue();
// If the store is truncating then it's going down to i16 or smaller, which
// means it can be implemented in a single store anyway.
if (St.isTruncatingStore())
return SDValue();
// Check that this is a splat.
// Make sure that each of the relevant vector element locations are inserted
// to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
SDValue SplatVal;
for (unsigned I = 0; I < NumVecElts; ++I) {
// Check for insert vector elements.
if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
// Check that same value is inserted at each vector element.
if (I == 0)
SplatVal = StVal.getOperand(1);
else if (StVal.getOperand(1) != SplatVal)
return SDValue();
// Check insert element index.
ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
if (!CIndex)
return SDValue();
uint64_t IndexVal = CIndex->getZExtValue();
if (IndexVal >= NumVecElts)
return SDValue();
IndexNotInserted.reset(IndexVal);
StVal = StVal.getOperand(0);
}
// Check that all vector element locations were inserted to.
if (IndexNotInserted.any())
return SDValue();
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
StoreSDNode *S = cast<StoreSDNode>(N);
if (S->isVolatile() || S->isIndexed())
return SDValue();
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
if (!VT.isFixedLengthVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
// scalars. They will be merged into store pairs of xzr thereby removing one
// instruction and one register.
if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
return ReplacedZeroSplat;
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
// Don't split stores with alignment of 1 or 2. Code that uses clang vector
// extensions can use this to mark that it does not want splitting to happen
// (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
// eliminating alignment hazards is only 1 in 8 for alignment of 2.
if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
S->getAlignment() <= 2)
return SDValue();
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
return ReplacedSplat;
SDLoc DL(S);
// Split VT into two.
EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
unsigned NumElts = HalfVT.getVectorNumElements();
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(0, DL, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(NumElts, DL, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
S->getPointerInfo(), S->getAlignment(),
S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
bool IsLaneOp) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (VT.isScalableVector())
return SDValue();
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not a LOAD, we cannot do such a combine.
if (LD->getOpcode() != ISD::LOAD)
return SDValue();
// The vector lane must be a constant in the LD1LANE opcode.
SDValue Lane;
if (IsLaneOp) {
Lane = N->getOperand(2);
auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
}
LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
EVT MemVT = LoadSDN->getMemoryVT();
// Check if memory operand is the same type as the vector element.
if (MemVT != VT.getVectorElementType())
return SDValue();
// Check if there are other uses. If so, do not combine as it will introduce
// an extra load.
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
++UI) {
if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
continue;
if (*UI != N)
return SDValue();
}
SDValue Addr = LD->getOperand(1);
SDValue Vector = N->getOperand(0);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD
|| UI.getUse().getResNo() != Addr.getResNo())
continue;
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = VT.getScalarSizeInBits() / 8;
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
// To avoid cycle construction make sure that neither the load nor the add
// are predecessors to each other or the Vector.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Visited.insert(Addr.getNode());
Worklist.push_back(User);
Worklist.push_back(LD);
Worklist.push_back(Vector.getNode());
if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
SmallVector<SDValue, 8> Ops;
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
Ops.push_back(Vector); // The vector to be inserted
Ops.push_back(Lane); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
MemVT,
LoadSDN->getMemOperand());
// Update the uses.
SDValue NewResults[] = {
SDValue(LD, 0), // The result of load
SDValue(UpdN.getNode(), 2) // Chain
};
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
break;
}
return SDValue();
}
/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
return true;
}
return false;
}
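// Illustrative effect (not from the original source): if the address is
// (or %ptr, 0xff00000000000000), i.e. only bits in the ignored top byte are
// set, none of the 56 demanded bits overlap the OR mask, so
// SimplifyDemandedBits can fold the address back to plain %ptr.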
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
performTBISimplification(N->getOperand(2), DCI, DAG))
return SDValue(N, 0);
return SDValue();
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
unsigned AddrOpIdx = N->getNumOperands() - 1;
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
continue;
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
// Find the new opcode for the updating load/store.
bool IsStore = false;
bool IsLaneOp = false;
bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
NumVecs = 2; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
NumVecs = 3; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
NumVecs = 4; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
NumVecs = 2; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
NumVecs = 3; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
NumVecs = 4; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
NumVecs = 2; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
NumVecs = 3; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
NumVecs = 4; IsStore = true; IsLaneOp = true; break;
}
EVT VecTy;
if (IsStore)
VecTy = N->getOperand(2).getValueType();
else
VecTy = N->getValueType(0);
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (IsLaneOp || IsDupOp)
NumBytes /= VecTy.getVectorNumElements();
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // Incoming chain
// Load lane and store have vector list as input.
if (IsLaneOp || IsStore)
for (unsigned i = 2; i < AddrOpIdx; ++i)
Ops.push_back(N->getOperand(i));
Ops.push_back(Addr); // Base register
Ops.push_back(Inc);
// Return Types.
EVT Tys[6];
unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
MemInt->getMemoryVT(),
MemInt->getMemOperand());
// Update the uses.
std::vector<SDValue> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
break;
}
return SDValue();
}
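// Worked example (illustrative, not from the original source): for an
// @llvm.aarch64.neon.ld2 of two v4i32 vectors whose address is separately
// incremented by 32 bytes (2 vectors x 16 bytes), the constant increment
// matches the memory size, so the load is rewritten to AArch64ISD::LD2post
// and the stand-alone ADD is replaced by the load's write-back result.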
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
ExtType = ISD::NON_EXTLOAD;
switch(V.getNode()->getOpcode()) {
default:
return false;
case ISD::LOAD: {
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
|| (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
}
return false;
}
case ISD::AssertSext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::SEXTLOAD;
return true;
}
return false;
}
case ISD::AssertZext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::ZEXTLOAD;
return true;
}
return false;
}
case ISD::Constant:
case ISD::TargetConstant: {
return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
1LL << (width - 1);
}
}
return true;
}
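// For instance (illustrative): an i8 zero-extending load feeding the compare
// yields ExtType == ISD::ZEXTLOAD for width == 8, while a constant operand is
// accepted only if its absolute value is below 1 << (width - 1).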
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
// +-------------+ +-------------+ +-------------+ +-------------+
// | Input | | AddConstant | | CompConstant| | CC |
// +-------------+ +-------------+ +-------------+ +-------------+
// | | | |
// V V | +----------+
// +-------------+ +----+ | |
// | ADD | |0xff| | |
// +-------------+ +----+ | |
// | | | |
// V V | |
// +-------------+ | |
// | AND | | |
// +-------------+ | |
// | | |
// +-----+ | |
// | | |
// V V V
// +-------------+
// | CMP |
// +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstants bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns for the current extension type of Input (w0).
//
// sub w8, w0, w1
// and w10, w8, #0x0f
// cmp w8, w2
// cset w9, AArch64CC
// cmp w10, w2
// cset w11, AArch64CC
// cmp w9, w11
// cset w0, eq
// ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave outputs equivalent to the above
// function for all inputs, so they can be used to determine if the removal is
// legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition since the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
ISD::LoadExtType ExtType, int AddConstant,
int CompConstant) {
// By being careful about our equations and only writing them in terms of
// symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
// width. Provided we are careful and make sure our equations are valid over
// the whole range we can just adjust the input and avoid writing equations
// for sign extended inputs.
if (ExtType == ISD::SEXTLOAD)
AddConstant -= (1 << (width-1));
switch(CC) {
case AArch64CC::LE:
case AArch64CC::GT:
if ((AddConstant == 0) ||
(CompConstant == MaxUInt - 1 && AddConstant < 0) ||
(AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::LT:
case AArch64CC::GE:
if ((AddConstant == 0) ||
(AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::HI:
case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant >= -1 &&
CompConstant < AddConstant + MaxUInt))
return true;
break;
case AArch64CC::PL:
case AArch64CC::MI:
if ((AddConstant == 0) ||
(AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::LO:
case AArch64CC::HS:
if ((AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant >= 0 &&
CompConstant <= AddConstant + MaxUInt))
return true;
break;
case AArch64CC::EQ:
case AArch64CC::NE:
if ((AddConstant > 0 && CompConstant < 0) ||
(AddConstant < 0 && CompConstant >= 0 &&
CompConstant < AddConstant + MaxUInt) ||
(AddConstant >= 0 && CompConstant >= 0 &&
CompConstant >= AddConstant) ||
(AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::VS:
case AArch64CC::VC:
case AArch64CC::AL:
case AArch64CC::NV:
return true;
case AArch64CC::Invalid:
break;
}
return false;
}
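// Worked instance (illustrative, not from the original source): with a
// zero-extended 8-bit input and AddConstant == 0, the ADD result already fits
// in 8 bits, so ANDing it with 0xff is a no-op and the masked and unmasked
// compares necessarily see the same value.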
static
SDValue performCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, unsigned CCIndex,
unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
if (CondOpcode != AArch64ISD::SUBS)
return SDValue();
// There is a SUBS feeding this condition. Is it fed by a mask we can
// use?
SDNode *AndNode = SubsNode->getOperand(0).getNode();
unsigned MaskBits = 0;
if (AndNode->getOpcode() != ISD::AND)
return SDValue();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
uint32_t CNV = CN->getZExtValue();
if (CNV == 255)
MaskBits = 8;
else if (CNV == 65535)
MaskBits = 16;
}
if (!MaskBits)
return SDValue();
SDValue AddValue = AndNode->getOperand(0);
if (AddValue.getOpcode() != ISD::ADD)
return SDValue();
// The basic dag structure is correct, grab the inputs and validate them.
SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and the provenance of all the values is a smaller type,
// let's see if the mask is superfluous.
if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
!isa<ConstantSDNode>(SubsInputValue.getNode()))
return SDValue();
ISD::LoadExtType ExtType;
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue1, MaskBits, ExtType) )
return SDValue();
if(!isEquivalentMaskless(CC, MaskBits, ExtType,
cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
return SDValue();
// The AND is not necessary, remove it.
SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
SubsNode->getValueType(1));
SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
return SDValue(N, 0);
}
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
return SDValue();
if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue CCVal = N->getOperand(2);
SDValue Cmp = N->getOperand(3);
assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return SDValue();
unsigned CmpOpc = Cmp.getOpcode();
if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
return SDValue();
// Only attempt folding if there is only one use of the flag and no use of the
// value.
if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
return SDValue();
SDValue LHS = Cmp.getOperand(0);
SDValue RHS = Cmp.getOperand(1);
assert(LHS.getValueType() == RHS.getValueType() &&
"Expected the value type to be the same for both operands!");
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return SDValue();
if (isNullConstant(LHS))
std::swap(LHS, RHS);
if (!isNullConstant(RHS))
return SDValue();
if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
LHS.getOpcode() == ISD::SRL)
return SDValue();
// Fold the compare into the branch instruction.
SDValue BR;
if (CC == AArch64CC::EQ)
BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
else
BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, BR, false);
return SDValue();
}
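// Illustrative folds performed above (not from the original source):
//   b.eq on flags from (subs x, #0)  ->  cbz  x, <dest>
//   b.ne on flags from (subs x, #0)  ->  cbnz x, <dest>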
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
SelectionDAG &DAG) {
if (!Op->hasOneUse())
return Op;
// We don't handle undef/constant-fold cases below, as they should have
// already been taken care of (e.g. and of 0, test of undefined shifted bits,
// etc.)
// (tbz (trunc x), b) -> (tbz x, b)
// This case is just here to enable more of the below cases to be caught.
if (Op->getOpcode() == ISD::TRUNCATE &&
Bit < Op->getValueType(0).getSizeInBits()) {
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
// (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
if (Op->getOpcode() == ISD::ANY_EXTEND &&
Bit < Op->getOperand(0).getValueSizeInBits()) {
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
if (Op->getNumOperands() != 2)
return Op;
auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!C)
return Op;
switch (Op->getOpcode()) {
default:
return Op;
// (tbz (and x, m), b) -> (tbz x, b)
case ISD::AND:
if ((C->getZExtValue() >> Bit) & 1)
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
return Op;
// (tbz (shl x, c), b) -> (tbz x, b-c)
case ISD::SHL:
if (C->getZExtValue() <= Bit &&
(Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit - C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
case ISD::SRA:
Bit = Bit + C->getZExtValue();
if (Bit >= Op->getValueType(0).getSizeInBits())
Bit = Op->getValueType(0).getSizeInBits() - 1;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
// (tbz (srl x, c), b) -> (tbz x, b+c)
case ISD::SRL:
if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit + C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (xor x, -1), b) -> (tbnz x, b)
case ISD::XOR:
if ((C->getZExtValue() >> Bit) & 1)
Invert = !Invert;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
}
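// Illustrative chain of folds (not from the original source):
//   (tbz (srl (and x, 0xf0), 4), 0)
//     -> (tbz (and x, 0xf0), 4)   // SRL case: bit 0 + shift amount 4
//     -> (tbz x, 4)               // AND case: mask bit 4 is set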
// Optimize test single bit zero/non-zero and branch.
static SDValue performTBZCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
bool Invert = false;
SDValue TestSrc = N->getOperand(1);
SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
if (TestSrc == NewTestSrc)
return SDValue();
unsigned NewOpc = N->getOpcode();
if (Invert) {
if (NewOpc == AArch64ISD::TBZ)
NewOpc = AArch64ISD::TBNZ;
else {
assert(NewOpc == AArch64ISD::TBNZ);
NewOpc = AArch64ISD::TBZ;
}
}
SDLoc DL(N);
return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
CCVT.getVectorElementType() != MVT::i1)
return SDValue();
EVT ResVT = N->getValueType(0);
EVT CmpVT = N0.getOperand(0).getValueType();
// Only combine when the result type is of the same size as the compared
// operands.
if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
return SDValue();
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
SDValue SetCC =
DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
static SDValue performSelectCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
if (N0.getOpcode() != ISD::SETCC)
return SDValue();
// Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
// scalar SetCCResultType. We also don't expect vectors, because we assume
// that selects fed by vector SETCCs are canonicalized to VSELECT.
assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
"Scalar-SETCC feeding SELECT has unexpected result type!");
// If NumMaskElts == 0, the comparison is larger than select result. The
// largest real NEON comparison is 64-bits per lane, which means the result is
// at most 32-bits and an illegal vector. Just bail out for now.
EVT SrcVT = N0.getOperand(0).getValueType();
// Don't try to do this optimization when the setcc itself has i1 operands.
// There are no legal vectors of i1, so this would be pointless.
if (SrcVT == MVT::i1)
return SDValue();
int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// Also bail out if the vector CCVT isn't the same size as ResVT.
// This can happen if the SETCC operand size doesn't divide the ResVT size
// (e.g., f64 vs v3f32).
if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
return SDValue();
// Make sure we didn't create illegal types, if we're not supposed to.
assert(DCI.isBeforeLegalize() ||
DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);
return SDValue();
}
// If all users of the globaladdr are of the form (globaladdr + constant), find
// the smallest constant, fold it into the globaladdr's offset and rewrite the
// globaladdr as (globaladdr + constant) - constant.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget,
const TargetMachine &TM) {
auto *GN = cast<GlobalAddressSDNode>(N);
if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
AArch64II::MO_NO_FLAG)
return SDValue();
uint64_t MinOffset = -1ull;
for (SDNode *N : GN->uses()) {
if (N->getOpcode() != ISD::ADD)
return SDValue();
auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
if (!C)
C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
MinOffset = std::min(MinOffset, C->getZExtValue());
}
uint64_t Offset = MinOffset + GN->getOffset();
// Require that the new offset is larger than the existing one. Otherwise, we
// can end up oscillating between two possible DAGs, for example,
// (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
if (Offset <= uint64_t(GN->getOffset()))
return SDValue();
// Check whether folding this offset is legal. It must not go out of bounds of
// the referenced object to avoid violating the code model, and must be
// smaller than 2^21 because this is the largest offset expressible in all
// object formats.
//
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
if (Offset >= (1 << 21))
return SDValue();
const GlobalValue *GV = GN->getGlobal();
Type *T = GV->getValueType();
if (!T->isSized() ||
Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
return SDValue();
SDLoc DL(GN);
SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
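// Illustrative example (not from the original source): if every use of @g is
// (add @g, C) and the smallest C is 8, @g is rewritten as (sub (@g + 8), 8),
// so that at least the smallest addend is folded into the global's offset.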
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
SDLoc DL, unsigned BitWidth) {
assert(Offset.getValueType().isScalableVector() &&
"This method is only for scalable vectors of offsets");
SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
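// For example (illustrative): with BitWidth == 32 the indices are shifted
// left by Log2_32(32 / 8) == 2, turning element indices into byte offsets.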
/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
/// [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
unsigned ScalarSizeInBytes) {
// The immediate is not a multiple of the scalar size.
if (OffsetInBytes % ScalarSizeInBytes)
return false;
// The immediate is out of range.
if (OffsetInBytes / ScalarSizeInBytes > 31)
return false;
return true;
}
/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
/// [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
unsigned ScalarSizeInBytes) {
ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
return OffsetConst && isValidImmForSVEVecImmAddrMode(
OffsetConst->getZExtValue(), ScalarSizeInBytes);
}
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
const SDValue Src = N->getOperand(2);
const EVT SrcVT = Src->getValueType(0);
assert(SrcVT.isScalableVector() &&
"Scatter stores are only possible for SVE vectors");
SDLoc DL(N);
MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
// Make sure that source data will fit into an SVE register
if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// For FPs, ACLE only supports _packed_ single and double precision types.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
SDValue Base = N->getOperand(4);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal scatters because there's no instruction that takes
// indices.
if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
}
// In the case of non-temporal scatter stores there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
std::swap(Base, Offset);
// SST1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// where #SizeInBytes is the size in bytes of the stored items. For
// immediates outside that range and non-immediate scalar offsets use SST1 or
// SST1_UXTW instead.
if (Opcode == AArch64ISD::SST1_IMM_PRED) {
if (!isValidImmForSVEVecImmAddrMode(Offset,
SrcVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = AArch64ISD::SST1_UXTW_PRED;
else
Opcode = AArch64ISD::SST1_PRED;
std::swap(Base, Offset);
}
}
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
// Some scatter store variants allow unpacked offsets, but only as nxv2i32
// vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
// nxv2i64. Legalize accordingly.
if (!OnlyPackedOffsets &&
Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
if (!TLI.isTypeLegal(Offset.getValueType()))
return SDValue();
// Source value type that is representable in hardware
EVT HwSrcVt = getSVEContainerType(SrcVT);
// Keep the original type of the input data to store - this is needed to be
// able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
// FP values we want the integer equivalent, so just use HwSrcVt.
SDValue InputVT = DAG.getValueType(SrcVT);
if (SrcVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue SrcNew;
if (Src.getValueType().isFloatingPoint())
SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
else
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
SDValue Ops[] = {N->getOperand(0), // Chain
SrcNew,
N->getOperand(3), // Pg
Base,
Offset,
InputVT};
return DAG.getNode(Opcode, DL, VTs, Ops);
}
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
const EVT RetVT = N->getValueType(0);
assert(RetVT.isScalableVector() &&
"Gather loads are only possible for SVE vectors");
SDLoc DL(N);
// Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
SDValue Base = N->getOperand(3);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal gathers because there's no instruction that takes
// indices.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
}
// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
Offset.getValueType().isVector())
std::swap(Base, Offset);
// GLD{FF}1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// where #SizeInBytes is the size in bytes of the loaded items. For
// immediates outside that range and non-immediate scalar offsets use
// GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
if (!isValidImmForSVEVecImmAddrMode(Offset,
RetVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
? AArch64ISD::GLD1_UXTW_MERGE_ZERO
: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
else
Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
? AArch64ISD::GLD1_MERGE_ZERO
: AArch64ISD::GLDFF1_MERGE_ZERO;
std::swap(Base, Offset);
}
}
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
// Some gather load variants allow unpacked offsets, but only as nxv2i32
// vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
// nxv2i64. Legalize accordingly.
if (!OnlyPackedOffsets &&
Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
// Return value type that is representable in hardware
EVT HwRetVt = getSVEContainerType(RetVT);
// Keep the original output value type around - this is needed to be able to
// select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
// values we want the integer equivalent, so just use HwRetVT.
SDValue OutVT = DAG.getValueType(RetVT);
if (RetVT.isFloatingPoint())
OutVT = DAG.getValueType(HwRetVt);
SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
SDValue Ops[] = {N->getOperand(0), // Chain
N->getOperand(2), // Pg
Base, Offset, OutVT};
SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);
if (RetVT.isInteger() && (RetVT != HwRetVt))
Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
// If the original return value was FP, bitcast accordingly. Doing it here
// means that we can avoid adding TableGen patterns for FPs.
if (RetVT.isFloatingPoint())
Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
return DAG.getMergeValues({Load, LoadChain}, DL);
}
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
// Sign extend of an unsigned unpack -> signed unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
: AArch64ISD::SUNPKLO;
// Push the sign extend to the operand of the unpack
// This is necessary where, for example, the operand of the unpack
// is another unpack:
// 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
// ->
// 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
// ->
// 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
SDValue ExtOp = Src->getOperand(0);
auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
EVT EltTy = VT.getVectorElementType();
(void)EltTy;
assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
"Sign extending from an invalid type");
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
VT.getVectorElementCount() * 2);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
ExtOp, DAG.getValueType(ExtVT));
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}
// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
unsigned MemVTOpNum = 4;
switch (Opc) {
case AArch64ISD::LD1_MERGE_ZERO:
NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
MemVTOpNum = 3;
break;
case AArch64ISD::LDNF1_MERGE_ZERO:
NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
MemVTOpNum = 3;
break;
case AArch64ISD::LDFF1_MERGE_ZERO:
NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
MemVTOpNum = 3;
break;
case AArch64ISD::GLD1_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
break;
case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
break;
case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
break;
case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLD1_IMM_MERGE_ZERO:
NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
break;
case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
break;
case AArch64ISD::GLDNT1_MERGE_ZERO:
NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
break;
default:
return SDValue();
}
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
SmallVector<SDValue, 5> Ops;
for (unsigned I = 0; I < Src->getNumOperands(); ++I)
Ops.push_back(Src->getOperand(I));
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
DCI.CombineTo(N, ExtLoad);
DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
// Return N so it doesn't get rechecked
return SDValue(N, 0);
}
/// Legalize the gather prefetch (scalar + vector addressing mode) when the
/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
/// != nxv2i32) do not need legalization.
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
const unsigned OffsetPos = 4;
SDValue Offset = N->getOperand(OffsetPos);
// Not an unpacked vector, bail out.
if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
return SDValue();
// Extend the unpacked offset vector to 64-bit lanes.
SDLoc DL(N);
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
// Replace the offset operand with the 64-bit one.
Ops[OffsetPos] = Offset;
return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}
/// Combines a node carrying the intrinsic
/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// sve gather prefetch instruction with vector plus immediate addressing mode.
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
unsigned ScalarSizeInBytes) {
const unsigned ImmPos = 4, OffsetPos = 3;
// No need to combine the node if the immediate is valid...
if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
return SDValue();
// ...otherwise swap the vector of bases with the scalar offset...
SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
std::swap(Ops[ImmPos], Ops[OffsetPos]);
// ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
// `aarch64_sve_prfb_gather_uxtw_index`.
SDLoc DL(N);
Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
MVT::i64);
return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
case ISD::ADD:
case ISD::SUB:
return performAddSubLongCombine(N, DCI, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::AND:
return performANDCombine(N, DCI);
case ISD::SRL:
return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
return performExtendCombine(N, DCI, DAG);
case ISD::SIGN_EXTEND_INREG:
return performSignExtendInRegCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
if (performTBISimplification(N->getOperand(1), DCI, DAG))
return SDValue(N, 0);
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r:
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_ld1rq:
return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
case Intrinsic::aarch64_sve_ld1ro:
return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnt1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnt1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1:
return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnf1:
return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1:
return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1:
return performST1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
return performScatterStoreCombine(N, DAG,
AArch64ISD::SST1_SXTW_SCALED_PRED,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
return performScatterStoreCombine(N, DAG,
AArch64ISD::SST1_UXTW_SCALED_PRED,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_sve_tuple_get: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Src1 = N->getOperand(2);
SDValue Idx = N->getOperand(3);
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
EVT ResVT = N->getValueType(0);
uint64_t NumLanes = ResVT.getVectorElementCount().Min;
SDValue Val =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
return DAG.getMergeValues({Val, Chain}, DL);
}
case Intrinsic::aarch64_sve_tuple_set: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Tuple = N->getOperand(2);
SDValue Idx = N->getOperand(3);
SDValue Vec = N->getOperand(4);
EVT TupleVT = Tuple.getValueType();
uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
if ((TupleLanes % NumLanes) != 0)
report_fatal_error("invalid tuple vector!");
uint64_t NumVecs = TupleLanes / NumLanes;
SmallVector<SDValue, 4> Opnds;
for (unsigned I = 0; I < NumVecs; ++I) {
if (I == IdxConst)
Opnds.push_back(Vec);
else {
Opnds.push_back(
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
DAG.getConstant(I * NumLanes, DL, MVT::i32)));
}
}
SDValue Concat =
DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
return DAG.getMergeValues({Concat, Chain}, DL);
}
case Intrinsic::aarch64_sve_tuple_create2:
case Intrinsic::aarch64_sve_tuple_create3:
case Intrinsic::aarch64_sve_tuple_create4: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SmallVector<SDValue, 4> Opnds;
for (unsigned I = 2; I < N->getNumOperands(); ++I)
Opnds.push_back(N->getOperand(I));
EVT VT = Opnds[0].getValueType();
EVT EltVT = VT.getVectorElementType();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VT.getVectorElementCount() *
(N->getNumOperands() - 2));
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
return DAG.getMergeValues({Concat, Chain}, DL);
}
case Intrinsic::aarch64_sve_ld2:
case Intrinsic::aarch64_sve_ld3:
case Intrinsic::aarch64_sve_ld4: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Mask = N->getOperand(2);
SDValue BasePtr = N->getOperand(3);
SDValue LoadOps[] = {Chain, Mask, BasePtr};
unsigned IntrinsicID =
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
SDValue Result =
LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
return DAG.getMergeValues({Result, Chain}, DL);
}
default:
break;
}
break;
case ISD::GlobalAddress:
return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
}
return SDValue();
}
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
SDValue &Chain) const {
if (N->getNumValues() != 1)
return false;
if (!N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode *Node : Copy->uses()) {
if (Node->getOpcode() != AArch64ISD::RET_FLAG)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
bool &IsInc,
SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
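// Illustrative note: isInt<9> below accepts offsets in [-256, 255]; for
// ISD::SUB the constant is negated first, so a decrement by 16 is checked
// as -16.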
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
if (Op->getOpcode() == ISD::SUB)
RHSC = -(uint64_t)RHSC;
if (!isInt<9>(RHSC))
return false;
IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
return true;
}
return false;
}
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
return true;
}
bool AArch64TargetLowering::getPostIndexedAddressParts(
SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
if (Ptr != Base)
return false;
AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
return true;
}
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
if (N->getValueType(0) != MVT::i16 ||
(Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
return;
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
static void ReplaceReductionResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned InterOp,
unsigned AcrossOp) {
EVT LoVT, HiVT;
SDValue Lo, Hi;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
Results.push_back(SplitVal);
}
static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
DAG.getNode(ISD::SRL, DL, MVT::i128, N,
DAG.getConstant(64, DL, MVT::i64)));
return std::make_pair(Lo, Hi);
}
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
// Common code will handle these just fine.
if (!InVT.isScalableVector() || !InVT.isInteger())
return;
SDLoc DL(N);
EVT VT = N->getValueType(0);
// The following checks bail if this is not a halving operation.
ElementCount ResEC = VT.getVectorElementCount();
if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
return;
auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CIndex)
return;
unsigned Index = CIndex->getZExtValue();
if ((Index != 0) && (Index != ResEC.Min))
return;
unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}
// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
SDValue VHi = DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
dl, MVT::i64);
if (DAG.getDataLayout().isBigEndian())
std::swap (VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
return SDValue(
DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
static void ReplaceCMP_SWAP_128Results(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
if (Subtarget->hasLSE()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
SDValue Ops[] = {
createGPRPairNode(DAG, N->getOperand(2)), // Compare value
createGPRPairNode(DAG, N->getOperand(3)), // Store value
N->getOperand(1), // Ptr
N->getOperand(0), // Chain in
};
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
unsigned Opcode;
switch (MemOp->getOrdering()) {
case AtomicOrdering::Monotonic:
Opcode = AArch64::CASPX;
break;
case AtomicOrdering::Acquire:
Opcode = AArch64::CASPAX;
break;
case AtomicOrdering::Release:
Opcode = AArch64::CASPLX;
break;
case AtomicOrdering::AcquireRelease:
case AtomicOrdering::SequentiallyConsistent:
Opcode = AArch64::CASPALX;
break;
default:
llvm_unreachable("Unexpected ordering!");
}
MachineSDNode *CmpSwap = DAG.getMachineNode(
Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
DAG.setNodeMemRefs(CmpSwap, {MemOp});
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0));
SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0));
Results.push_back(
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
return;
}
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
New.first, New.second, N->getOperand(0)};
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
Results.push_back(SDValue(CmpSwap, 3));
}
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom expand this");
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
case ISD::CTPOP:
Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
case AArch64ISD::UADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
return;
case AArch64ISD::SMINV:
ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
return;
case AArch64ISD::UMINV:
ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
return;
case AArch64ISD::SMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
return;
case AArch64ISD::UMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
case ISD::LOAD: {
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
"unexpected load's value type");
LoadSDNode *LoadNode = cast<LoadSDNode>(N);
if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
// Non-volatile loads are optimized later in AArch64's load/store
// optimizer.
return;
}
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::LDP, SDLoc(N),
DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
{LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
LoadNode->getMemOperand());
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
Result.getValue(0), Result.getValue(1));
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
case ISD::EXTRACT_SUBVECTOR:
ReplaceExtractSubVectorResults(N, Results, DAG);
return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
"custom lowering for unexpected type");
ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default:
return;
case Intrinsic::aarch64_sve_clasta_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
N->getOperand(1), Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_clastb_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
N->getOperand(1), Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_lasta: {
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_lastb: {
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
}
}
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
return TargetLowering::useLoadStackGuardNode();
return true;
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
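// Illustrative example: given x/d, y/d and z/d with the same divisor d, the
// combine emits r = 1.0/d once and rewrites the divisions as x*r, y*r, z*r,
// trading three divisions for one division plus three multiplies.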
return 3;
}
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
// During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
// v4i16, v2i32 instead of to promote.
if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
VT == MVT::v1f32)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
return Size == 128;
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size > 128) return AtomicExpansionKind::None;
// Nand not supported in LSE.
if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
// Leave 128 bits to LLSC.
return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
}
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
// If subtarget has LSE, leave cmpxchg intact for codegen.
if (Subtarget->hasLSE())
return AtomicExpansionKind::None;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
// single i128 here.
if (ValTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
Function *Ldxr = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
return Builder.CreateBitCast(Trunc, EltTy);
}
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilder<> &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
// before the call.
if (Val->getType()->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
return Builder.CreateCall(Stxr,
{Builder.CreateZExtOrBitCast(
Val, Stxr->getFunctionType()->getParamType(0)),
Addr});
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
EVT) const {
return false;
}
static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
Offset),
IRB.getInt8PtrTy()->getPointerTo(0));
}
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the stack cookie. See the definition
// of TLS_SLOT_STACK_GUARD in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x28);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x10);
return TargetLowering::getIRStackGuard(IRB);
}
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::Win64);
F->addAttribute(1, Attribute::AttrKind::InReg);
}
return;
}
TargetLowering::insertSSPDeclarations(M);
}
Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getGlobalVariable("__security_cookie");
return TargetLowering::getSDagStackGuard(M);
}
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getFunction("__security_check_cookie");
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x48);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x8);
return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
// Only sink 'and' mask to cmp use block if it is masking a single bit, since
// this likely allows the and/cmp/br to be folded into a single tbz instruction. It
// may be beneficial to sink in other cases, but we would have to check that
// the cmp would not get folded into the br to form a cbz for these to be
// beneficial.
ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
if (!Mask)
return false;
return Mask->getValue().isPowerOf2();
}
bool AArch64TargetLowering::
shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const {
// Does baseline recommend not to perform the fold by default?
if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
return false;
// Else, if this is a vector shift, prefer 'shl'.
return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
!Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
return false;
return true;
}
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
AFI->setIsSplitCSR(true);
}
void AArch64TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AArch64::GPR64RegClass.contains(*I))
RC = &AArch64::GPR64RegClass;
else if (AArch64::FPR64RegClass.contains(*I))
RC = &AArch64::FPR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
// The exception to this is vector division. Since AArch64 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
// We want inc-of-add for scalars and sub-of-not for vectors.
return VT.isScalarInteger();
}
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
return getPointerTy(DL).getSizeInBits();
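// Otherwise va_list is the AAPCS64 struct (layout assumed for illustration):
// three pointers (__stack, __gr_top, __vr_top) plus two 32-bit offsets
// (__gr_offs, __vr_offs), which is what the expression below computes.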
return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}
void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
MF.getFrameInfo().computeMaxCallFrameSize(MF);
TargetLoweringBase::finalizeLowering(MF);
}
// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
}
bool AArch64TargetLowering::shouldLocalize(
const MachineInstr &MI, const TargetTransformInfo *TTI) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_GLOBAL_VALUE: {
// On Darwin, TLS global vars get selected into function calls, which
// we don't want localized, as they can get moved into the middle of
// another call sequence.
const GlobalValue &GV = *MI.getOperand(1).getGlobal();
if (GV.isThreadLocal() && Subtarget->isTargetMachO())
return false;
break;
}
// If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
// localizable.
case AArch64::ADRP:
case AArch64::G_ADD_LOW:
return true;
default:
break;
}
return TargetLoweringBase::shouldLocalize(MI, TTI);
}
bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
if (isa<ScalableVectorType>(Inst.getType()))
return true;
for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
return true;
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ if (isa<ScalableVectorType>(AI->getAllocatedType()))
+ return true;
+ }
+
return false;
}
// Return the largest legal scalable vector type that matches VT's element type.
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
assert(VT.isFixedLengthVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default:
llvm_unreachable("unexpected element type for SVE container");
case MVT::i8:
return EVT(MVT::nxv16i8);
case MVT::i16:
return EVT(MVT::nxv8i16);
case MVT::i32:
return EVT(MVT::nxv4i32);
case MVT::i64:
return EVT(MVT::nxv2i64);
case MVT::f16:
return EVT(MVT::nxv8f16);
case MVT::f32:
return EVT(MVT::nxv4f32);
case MVT::f64:
return EVT(MVT::nxv2f64);
}
}
// Return a PTRUE with active lanes corresponding to the extent of VT.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
EVT VT) {
assert(VT.isFixedLengthVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
int PgPattern;
switch (VT.getVectorNumElements()) {
default:
llvm_unreachable("unexpected element count for SVE predicate");
case 1:
PgPattern = AArch64SVEPredPattern::vl1;
break;
case 2:
PgPattern = AArch64SVEPredPattern::vl2;
break;
case 4:
PgPattern = AArch64SVEPredPattern::vl4;
break;
case 8:
PgPattern = AArch64SVEPredPattern::vl8;
break;
case 16:
PgPattern = AArch64SVEPredPattern::vl16;
break;
case 32:
PgPattern = AArch64SVEPredPattern::vl32;
break;
case 64:
PgPattern = AArch64SVEPredPattern::vl64;
break;
case 128:
PgPattern = AArch64SVEPredPattern::vl128;
break;
case 256:
PgPattern = AArch64SVEPredPattern::vl256;
break;
}
// TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
// use AArch64SVEPredPattern::all, which can enable the use of unpredicated
// variants of instructions when available.
MVT MaskVT;
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default:
llvm_unreachable("unexpected element type for SVE predicate");
case MVT::i8:
MaskVT = MVT::nxv16i1;
break;
case MVT::i16:
case MVT::f16:
MaskVT = MVT::nxv8i1;
break;
case MVT::i32:
case MVT::f32:
MaskVT = MVT::nxv4i1;
break;
case MVT::i64:
case MVT::f64:
MaskVT = MVT::nxv2i1;
break;
}
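// Example (illustrative): an 8 x i16 fixed-length vector selects MaskVT
// nxv8i1 with the vl8 pattern, i.e. a "ptrue p<n>.h, vl8" predicate.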
return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
DAG.getTargetConstant(PgPattern, DL, MVT::i64));
}
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
EVT VT) {
assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal scalable vector!");
auto PredTy = VT.changeVectorElementType(MVT::i1);
return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
}
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
if (VT.isFixedLengthVector())
return getPredicateForFixedLengthVector(DAG, DL, VT);
return getPredicateForScalableVector(DAG, DL, VT);
}
// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
assert(VT.isScalableVector() &&
"Expected to convert into a scalable vector!");
assert(V.getValueType().isFixedLengthVector() &&
"Expected a fixed length vector operand!");
SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}
// Shrink V so it's just big enough to maintain a VT's worth of data.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
assert(VT.isFixedLengthVector() &&
"Expected to convert into a fixed length vector!");
assert(V.getValueType().isScalableVector() &&
"Expected a scalable vector operand!");
SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}
// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto Load = cast<LoadSDNode>(Op);
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
auto NewLoad = DAG.getMaskedLoad(
ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
Load->getExtensionType());
auto Result = convertFromScalableVector(DAG, VT, NewLoad);
SDValue MergedValues[2] = {Result, Load->getChain()};
return DAG.getMergeValues(MergedValues, DL);
}
// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto Store = cast<StoreSDNode>(Op);
SDLoc DL(Op);
EVT VT = Store->getValue().getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
Store->getMemOperand(), Store->getAddressingMode(),
Store->isTruncatingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
SDLoc DL(Op);
SDValue Val = Op.getOperand(0);
EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
Val = convertToScalableVector(DAG, ContainerVT, Val);
// Repeatedly truncate Val until the result is of the desired element type.
switch (ContainerVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("unimplemented container type");
case MVT::nxv2i64:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
if (VT.getVectorElementType() == MVT::i32)
break;
LLVM_FALLTHROUGH;
case MVT::nxv4i32:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
if (VT.getVectorElementType() == MVT::i16)
break;
LLVM_FALLTHROUGH;
case MVT::nxv8i16:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
break;
}
return convertFromScalableVector(DAG, VT, Val);
}
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SelectionDAG &DAG,
unsigned NewOp) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
auto Pg = getPredicateForVector(DAG, DL, VT);
if (useSVEForFixedLengthVectorVT(VT)) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
// Create a list of operands by converting existing ones to scalable types.
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
if (isa<CondCodeSDNode>(V)) {
Operands.push_back(V);
continue;
}
assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
"Only fixed length vectors are supported!");
Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
}
auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
return convertFromScalableVector(DAG, VT, ScalableRes);
}
assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
"Only scalable vectors are supported!");
Operands.push_back(V);
}
return DAG.getNode(NewOp, DL, VT, Operands);
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 6df7970f4d82..4f4ba692c2db 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1,11233 +1,11236 @@
//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Describe AArch64 instructions format here
//
// Format specifies the encoding used by the instruction. This is part of the
// ad-hoc solution used to emit machine instruction encodings by our machine
// code emitter.
class Format<bits<2> val> {
bits<2> Value = val;
}
def PseudoFrm : Format<0>;
def NormalFrm : Format<1>; // Do we need any others?
// Enum describing whether an instruction is
// destructive in its first source operand.
class DestructiveInstTypeEnum<bits<4> val> {
bits<4> Value = val;
}
def NotDestructive : DestructiveInstTypeEnum<0>;
// Destructive in its first operand and can be MOVPRFX'd, but has no other
// special properties.
def DestructiveOther : DestructiveInstTypeEnum<1>;
def DestructiveUnary : DestructiveInstTypeEnum<2>;
def DestructiveBinaryImm : DestructiveInstTypeEnum<3>;
def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>;
def DestructiveBinary : DestructiveInstTypeEnum<5>;
def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
class FalseLanesEnum<bits<2> val> {
bits<2> Value = val;
}
def FalseLanesNone : FalseLanesEnum<0>;
def FalseLanesZero : FalseLanesEnum<1>;
def FalseLanesUndef : FalseLanesEnum<2>;
// AArch64 Instruction Format
class AArch64Inst<Format f, string cstr> : Instruction {
field bits<32> Inst; // Instruction encoding.
// Mask of bits that cause an encoding to be UNPREDICTABLE.
// If a bit is set, then if the corresponding bit in the
// target encoding differs from its value in the "Inst" field,
// the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
field bits<32> Unpredictable = 0;
// SoftFail is the generic name for this field, but we alias it so
// as to make it more obvious what it means in ARM-land.
field bits<32> SoftFail = Unpredictable;
let Namespace = "AArch64";
Format F = f;
bits<2> Form = F.Value;
// Defaults
FalseLanesEnum FalseLanes = FalseLanesNone;
DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
ElementSizeEnum ElementSize = ElementSizeNone;
let TSFlags{8-7} = FalseLanes.Value;
let TSFlags{6-3} = DestructiveInstType.Value;
let TSFlags{2-0} = ElementSize.Value;
let Pattern = [];
let Constraints = cstr;
}
class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
// Pseudo instructions (don't have encoding information)
class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
: AArch64Inst<PseudoFrm, cstr> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
let isCodeGenOnly = 1;
let isPseudo = 1;
}
// Real instructions (have encoding information)
class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
let Pattern = pattern;
let Size = 4;
}
// Normal instructions
class I<dag oops, dag iops, string asm, string operands, string cstr,
list<dag> pattern>
: EncodedI<cstr, pattern> {
dag OutOperandList = oops;
dag InOperandList = iops;
let AsmString = !strconcat(asm, operands);
}
class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
// Helper fragment for an extract of the high portion of a 128-bit vector.
def extract_high_v16i8 :
UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
def extract_high_v8i16 :
UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
def extract_high_v4i32 :
UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
def extract_high_v2i64 :
UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
//===----------------------------------------------------------------------===//
// Asm Operand Classes.
//
// Shifter operand for arithmetic shifted encodings.
def ShifterOperand : AsmOperandClass {
let Name = "Shifter";
}
// Shifter operand for mov immediate encodings.
def MovImm32ShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MovImm32Shifter";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "InvalidMovImm32Shift";
}
def MovImm64ShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MovImm64Shifter";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "InvalidMovImm64Shift";
}
// Shifter operand for arithmetic register shifted encodings.
class ArithmeticShifterOperand<int width> : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "ArithmeticShifter" # width;
let PredicateMethod = "isArithmeticShifter<" # width # ">";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "AddSubRegShift" # width;
}
def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
// Shifter operand for logical register shifted encodings.
class LogicalShifterOperand<int width> : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "LogicalShifter" # width;
let PredicateMethod = "isLogicalShifter<" # width # ">";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "AddSubRegShift" # width;
}
def LogicalShifterOperand32 : LogicalShifterOperand<32>;
def LogicalShifterOperand64 : LogicalShifterOperand<64>;
// Shifter operand for logical vector 128/64-bit shifted encodings.
def LogicalVecShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "LogicalVecShifter";
let RenderMethod = "addShifterOperands";
}
def LogicalVecHalfWordShifterOperand : AsmOperandClass {
let SuperClasses = [LogicalVecShifterOperand];
let Name = "LogicalVecHalfWordShifter";
let RenderMethod = "addShifterOperands";
}
// The "MSL" shifter on the vector MOVI instruction.
def MoveVecShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MoveVecShifter";
let RenderMethod = "addShifterOperands";
}
// Extend operand for arithmetic encodings.
def ExtendOperand : AsmOperandClass {
let Name = "Extend";
let DiagnosticType = "AddSubRegExtendLarge";
}
def ExtendOperand64 : AsmOperandClass {
let SuperClasses = [ExtendOperand];
let Name = "Extend64";
let DiagnosticType = "AddSubRegExtendSmall";
}
// 'extend' that's a lsl of a 64-bit register.
def ExtendOperandLSL64 : AsmOperandClass {
let SuperClasses = [ExtendOperand];
let Name = "ExtendLSL64";
let RenderMethod = "addExtend64Operands";
let DiagnosticType = "AddSubRegExtendLarge";
}
// 8-bit floating-point immediate encodings.
def FPImmOperand : AsmOperandClass {
let Name = "FPImm";
let ParserMethod = "tryParseFPImm<true>";
let DiagnosticType = "InvalidFPImm";
}
def CondCode : AsmOperandClass {
let Name = "CondCode";
let DiagnosticType = "InvalidCondCode";
}
// A 32-bit register parsed as 64-bit
def GPR32as64Operand : AsmOperandClass {
let Name = "GPR32as64";
let ParserMethod =
"tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSubReg>";
}
def GPR32as64 : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32as64Operand;
}
// A 64-bit register parsed as 32-bit
def GPR64as32Operand : AsmOperandClass {
let Name = "GPR64as32";
let ParserMethod =
"tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSuperReg>";
}
def GPR64as32 : RegisterOperand<GPR64, "printGPR64as32"> {
let ParserMatchClass = GPR64as32Operand;
}
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
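// For example, the eight-bit value 0b10000001 (a = h = 1, all other bits 0)
// expands to the 64-bit constant 0xFF000000000000FF: each bit is replicated
// across one byte of the result.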
def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
class UImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
let Name = "UImm" # Width # "s" # Scale;
let DiagnosticType = "InvalidMemoryIndexed" # Scale # "UImm" # Width;
let RenderMethod = "addImmScaledOperands<" # Scale # ">";
let PredicateMethod = "isUImmScaled<" # Width # ", " # Scale # ">";
}
class SImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
let Name = "SImm" # Width # "s" # Scale;
let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm" # Width;
let RenderMethod = "addImmScaledOperands<" # Scale # ">";
let PredicateMethod = "isSImmScaled<" # Width # ", " # Scale # ">";
}
//===----------------------------------------------------------------------===//
// Operand Definitions.
//
// ADR[P] instruction labels.
def AdrpOperand : AsmOperandClass {
let Name = "AdrpLabel";
let ParserMethod = "tryParseAdrpLabel";
let DiagnosticType = "InvalidLabel";
}
def adrplabel : Operand<i64> {
let EncoderMethod = "getAdrLabelOpValue";
let PrintMethod = "printAdrpLabel";
let ParserMatchClass = AdrpOperand;
}
def AdrOperand : AsmOperandClass {
let Name = "AdrLabel";
let ParserMethod = "tryParseAdrLabel";
let DiagnosticType = "InvalidLabel";
}
def adrlabel : Operand<i64> {
let EncoderMethod = "getAdrLabelOpValue";
let ParserMatchClass = AdrOperand;
}
class SImmOperand<int width> : AsmOperandClass {
let Name = "SImm" # width;
let DiagnosticType = "InvalidMemoryIndexedSImm" # width;
let RenderMethod = "addImmOperands";
let PredicateMethod = "isSImm<" # width # ">";
}
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
let RenderMethod = "addImmOperands";
let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
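// With a scale of 8 this covers byte offsets in [-4096, 4088] in steps of 8
// (e.g. the LDRAA/LDRAB addressing forms).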
def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
def simm10Scaled : Operand<i64> {
let ParserMatchClass = SImm10s8Operand;
let DecoderMethod = "DecodeSImm<10>";
let PrintMethod = "printImmScale<8>";
}
def simm9s16 : Operand<i64> {
let ParserMatchClass = SImmScaledMemoryIndexed<9, 16>;
let DecoderMethod = "DecodeSImm<9>";
let PrintMethod = "printImmScale<16>";
}
// uimm6 predicate - True if the immediate is in the range [0, 63].
def UImm6Operand : AsmOperandClass {
let Name = "UImm6";
let DiagnosticType = "InvalidImm0_63";
}
def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
let ParserMatchClass = UImm6Operand;
}
def uimm16 : Operand<i16>, ImmLeaf<i16, [{return Imm >= 0 && Imm < 65536;}]>{
let ParserMatchClass = AsmImmRange<0, 65535>;
}
def SImm9Operand : SImmOperand<9>;
def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
let ParserMatchClass = SImm9Operand;
let DecoderMethod = "DecodeSImm<9>";
}
def SImm8Operand : SImmOperand<8>;
def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 127; }]> {
let ParserMatchClass = SImm8Operand;
let DecoderMethod = "DecodeSImm<8>";
}
def SImm6Operand : SImmOperand<6>;
def simm6_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -32 && Imm < 32; }]> {
let ParserMatchClass = SImm6Operand;
let DecoderMethod = "DecodeSImm<6>";
}
def SImm5Operand : SImmOperand<5>;
def simm5_64b : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -16 && Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
}
def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
}
def simm5_8b : Operand<i32>, ImmLeaf<i32, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
let PrintMethod = "printSImm<8>";
}
def simm5_16b : Operand<i32>, ImmLeaf<i32, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
let PrintMethod = "printSImm<16>";
}
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
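// These are the offset forms of the load/store pair instructions; e.g.
// simm7s8 (LDP/STP of X registers) covers byte offsets [-512, 504] in
// steps of 8.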
def SImm7s4Operand : SImmScaledMemoryIndexed<7, 4>;
def SImm7s8Operand : SImmScaledMemoryIndexed<7, 8>;
def SImm7s16Operand : SImmScaledMemoryIndexed<7, 16>;
def simm7s4 : Operand<i32> {
let ParserMatchClass = SImm7s4Operand;
let PrintMethod = "printImmScale<4>";
}
def simm7s8 : Operand<i32> {
let ParserMatchClass = SImm7s8Operand;
let PrintMethod = "printImmScale<8>";
}
def simm7s16 : Operand<i32> {
let ParserMatchClass = SImm7s16Operand;
let PrintMethod = "printImmScale<16>";
}
def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>;
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
def UImmS1XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
def UImmS4XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64);
}]>;
def UImmS8XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
}]>;
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 31 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
def uimm5s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
}
def uimm5s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
}
def uimm5s8 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
}
// tuimm5sN predicate - similar to uimm5sN, but use TImmLeaf (TargetConstant)
// instead of ImmLeaf (Constant)
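// (Immediate intrinsic arguments are typically lowered to TargetConstant
// nodes, so the TImmLeaf variants are the ones that match in intrinsic
// selection patterns.)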
def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
}
def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
}
def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
}
// uimm6sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 63 * N].
def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>;
def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>;
def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>;
def UImm6s16Operand : UImmScaledMemoryIndexed<6, 16>;
def uimm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
let ParserMatchClass = UImm6s1Operand;
}
def uimm6s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = UImm6s2Operand;
}
def uimm6s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = UImm6s4Operand;
}
def uimm6s8 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> {
let PrintMethod = "printImmScale<8>";
let ParserMatchClass = UImm6s8Operand;
}
def uimm6s16 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*16) && ((Imm % 16) == 0); }]> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = UImm6s16Operand;
}
def SImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
def SImmS3XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
}]>;
def SImmS4XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
}]>;
def SImmS16XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
}]>;
+def SImmS32XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64);
+}]>;
// simm6sN predicate - True if the immediate is a multiple of N in the range
// [-32 * N, 31 * N].
def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
def simm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -32 && Imm < 32; }]> {
let ParserMatchClass = SImm6s1Operand;
let DecoderMethod = "DecodeSImm<6>";
}
// simm4sN predicate - True if the immediate is a multiple of N in the range
// [-8 * N, 7 * N].
def SImm4s1Operand : SImmScaledMemoryIndexed<4, 1>;
def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>;
def simm4s1 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-8 && Imm <= 7; }]> {
let ParserMatchClass = SImm4s1Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = SImm4s2Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s3 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
let PrintMethod = "printImmScale<3>";
let ParserMatchClass = SImm4s3Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = SImm4s4Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s16 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = SImm4s16Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s32 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
+[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> {
let PrintMethod = "printImmScale<32>";
let ParserMatchClass = SImm4s32Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def Imm1_8Operand : AsmImmRange<1, 8>;
def Imm1_16Operand : AsmImmRange<1, 16>;
def Imm1_32Operand : AsmImmRange<1, 32>;
def Imm1_64Operand : AsmImmRange<1, 64>;
class BranchTarget<int N> : AsmOperandClass {
let Name = "BranchTarget" # N;
let DiagnosticType = "InvalidLabel";
let PredicateMethod = "isBranchTarget<" # N # ">";
}
class PCRelLabel<int N> : BranchTarget<N> {
let Name = "PCRelLabel" # N;
}
def BranchTarget14Operand : BranchTarget<14>;
def BranchTarget26Operand : BranchTarget<26>;
def PCRelLabel19Operand : PCRelLabel<19>;
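// The target is a word offset, so an N-bit branch target reaches roughly
// +/-2^(N+1) bytes: about +/-32 KiB for the 14-bit form (TBZ/TBNZ),
// +/-1 MiB for the 19-bit form (B.cond, CBZ/CBNZ) and +/-128 MiB for the
// 26-bit form (B/BL).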
def MovWSymbolG3AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG3";
let RenderMethod = "addImmOperands";
}
def movw_symbol_g3 : Operand<i32> {
let ParserMatchClass = MovWSymbolG3AsmOperand;
}
def MovWSymbolG2AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG2";
let RenderMethod = "addImmOperands";
}
def movw_symbol_g2 : Operand<i32> {
let ParserMatchClass = MovWSymbolG2AsmOperand;
}
def MovWSymbolG1AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG1";
let RenderMethod = "addImmOperands";
}
def movw_symbol_g1 : Operand<i32> {
let ParserMatchClass = MovWSymbolG1AsmOperand;
}
def MovWSymbolG0AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG0";
let RenderMethod = "addImmOperands";
}
def movw_symbol_g0 : Operand<i32> {
let ParserMatchClass = MovWSymbolG0AsmOperand;
}
class fixedpoint_i32<ValueType FloatVT>
: Operand<FloatVT>,
ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
let EncoderMethod = "getFixedPointScaleOpValue";
let DecoderMethod = "DecodeFixedPointScaleImm32";
let ParserMatchClass = Imm1_32Operand;
}
class fixedpoint_i64<ValueType FloatVT>
: Operand<FloatVT>,
ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
let EncoderMethod = "getFixedPointScaleOpValue";
let DecoderMethod = "DecodeFixedPointScaleImm64";
let ParserMatchClass = Imm1_64Operand;
}
def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR8OpValue";
let DecoderMethod = "DecodeVecShiftR8Imm";
let ParserMatchClass = Imm1_8Operand;
}
def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16Imm";
let ParserMatchClass = Imm1_16Operand;
}
def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
let ParserMatchClass = Imm1_8Operand;
}
def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
}
def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
let ParserMatchClass = Imm1_16Operand;
}
def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64Imm";
let ParserMatchClass = Imm1_64Operand;
}
def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
let ParserMatchClass = Imm1_32Operand;
}
// Same as vecshiftR#N, but use TargetConstant (TImmLeaf) instead of Constant
// (ImmLeaf)
def tvecshiftR8 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR8OpValue";
let DecoderMethod = "DecodeVecShiftR8Imm";
let ParserMatchClass = Imm1_8Operand;
}
def tvecshiftR16 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16Imm";
let ParserMatchClass = Imm1_16Operand;
}
def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
}
def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64Imm";
let ParserMatchClass = Imm1_64Operand;
}
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
def Imm0_31Operand : AsmImmRange<0, 31>;
def Imm0_63Operand : AsmImmRange<0, 63>;
def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 8);
}]> {
let EncoderMethod = "getVecShiftL8OpValue";
let DecoderMethod = "DecodeVecShiftL8Imm";
let ParserMatchClass = Imm0_7Operand;
}
def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 16);
}]> {
let EncoderMethod = "getVecShiftL16OpValue";
let DecoderMethod = "DecodeVecShiftL16Imm";
let ParserMatchClass = Imm0_15Operand;
}
def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let EncoderMethod = "getVecShiftL32OpValue";
let DecoderMethod = "DecodeVecShiftL32Imm";
let ParserMatchClass = Imm0_31Operand;
}
def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 64);
}]> {
let EncoderMethod = "getVecShiftL64OpValue";
let DecoderMethod = "DecodeVecShiftL64Imm";
let ParserMatchClass = Imm0_63Operand;
}
// Same as vecshiftL#N, but use TargetConstant (TImmLeaf) instead of Constant
// (ImmLeaf)
def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 8);
}]> {
let EncoderMethod = "getVecShiftL8OpValue";
let DecoderMethod = "DecodeVecShiftL8Imm";
let ParserMatchClass = Imm0_7Operand;
}
def tvecshiftL16 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 16);
}]> {
let EncoderMethod = "getVecShiftL16OpValue";
let DecoderMethod = "DecodeVecShiftL16Imm";
let ParserMatchClass = Imm0_15Operand;
}
def tvecshiftL32 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let EncoderMethod = "getVecShiftL32OpValue";
let DecoderMethod = "DecodeVecShiftL32Imm";
let ParserMatchClass = Imm0_31Operand;
}
def tvecshiftL64 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 64);
}]> {
let EncoderMethod = "getVecShiftL64OpValue";
let DecoderMethod = "DecodeVecShiftL64Imm";
let ParserMatchClass = Imm0_63Operand;
}
// Crazy immediate formats used by 32-bit and 64-bit logical immediate
// instructions for splatting repeating bit patterns across the immediate.
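// For example, 0x00ff00ff00ff00ff is a valid 64-bit logical immediate: a run
// of eight ones within a 16-bit element, replicated across the register. The
// XForms below convert such values into the N:immr:imms encoding used by the
// instructions.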
def logical_imm32_XFORM : SDNodeXForm<imm, [{
uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>;
def logical_imm64_XFORM : SDNodeXForm<imm, [{
uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>;
def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">,
GISDNodeXFormEquiv<logical_imm32_XFORM>;
def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">,
GISDNodeXFormEquiv<logical_imm64_XFORM>;
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
let PredicateMethod = "isLogicalImm<int32_t>";
let RenderMethod = "addLogicalImmOperands<int32_t>";
}
def LogicalImm64Operand : AsmOperandClass {
let Name = "LogicalImm64";
let PredicateMethod = "isLogicalImm<int64_t>";
let RenderMethod = "addLogicalImmOperands<int64_t>";
}
def LogicalImm32NotOperand : AsmOperandClass {
let Name = "LogicalImm32Not";
let PredicateMethod = "isLogicalImm<int32_t>";
let RenderMethod = "addLogicalImmNotOperands<int32_t>";
}
def LogicalImm64NotOperand : AsmOperandClass {
let Name = "LogicalImm64Not";
let PredicateMethod = "isLogicalImm<int64_t>";
let RenderMethod = "addLogicalImmNotOperands<int64_t>";
}
}
def logical_imm32 : Operand<i32>, IntImmLeaf<i32, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 32);
}], logical_imm32_XFORM> {
let PrintMethod = "printLogicalImm<int32_t>";
let ParserMatchClass = LogicalImm32Operand;
}
def logical_imm64 : Operand<i64>, IntImmLeaf<i64, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 64);
}], logical_imm64_XFORM> {
let PrintMethod = "printLogicalImm<int64_t>";
let ParserMatchClass = LogicalImm64Operand;
}
def logical_imm32_not : Operand<i32> {
let ParserMatchClass = LogicalImm32NotOperand;
}
def logical_imm64_not : Operand<i64> {
let ParserMatchClass = LogicalImm64NotOperand;
}
// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
}]>;
def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 65536;
}]>;
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
def Imm0_255Operand : AsmImmRange<0,255>;
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
let ParserMatchClass = Imm0_255Operand;
let PrintMethod = "printImm";
}
// imm0_127 predicate - True if the immediate is in the range [0,127]
def Imm0_127Operand : AsmImmRange<0, 127>;
def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
let PrintMethod = "printImm";
}
def imm0_127_64b : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
let PrintMethod = "printImm";
}
// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
// for all shift-amounts.
// imm0_63 predicate - True if the immediate is in the range [0,63]
def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 64;
}]> {
let ParserMatchClass = Imm0_63Operand;
}
// imm0_31 predicate - True if the immediate is in the range [0,31]
def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
}
// timm0_31 predicate - same as imm0_31, but use TargetConstant (TImmLeaf)
// instead of Constant (ImmLeaf)
def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
}
// True if the 32-bit immediate is in the range [0,31]
def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
}
// imm0_1 predicate - True if the immediate is in the range [0,1]
def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 2;
}]> {
let ParserMatchClass = Imm0_1Operand;
}
// imm0_15 predicate - True if the immediate is in the range [0,15]
def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 16;
}]> {
let ParserMatchClass = Imm0_15Operand;
}
// imm0_7 predicate - True if the immediate is in the range [0,7]
def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
}
// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
}
// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 16;
}]> {
let ParserMatchClass = Imm0_15Operand;
}
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
// {5-0} - imm6
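// For example, in "add w0, w1, w2, lsl #4" the shifted-register operand
// encodes shift type 00 (lsl) with imm6 = 4.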
class arith_shift<ValueType Ty, int width> : Operand<Ty> {
let PrintMethod = "printShifter";
let ParserMatchClass = !cast<AsmOperandClass>(
"ArithmeticShifterOperand" # width);
}
def arith_shift32 : arith_shift<i32, 32>;
def arith_shift64 : arith_shift<i64, 64>;
class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
: Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
let PrintMethod = "printShiftedRegister";
let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
}
def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
def gi_arith_shifted_reg32 :
GIComplexOperandMatcher<s32, "selectArithShiftedRegister">,
GIComplexPatternEquiv<arith_shifted_reg32>;
def gi_arith_shifted_reg64 :
GIComplexOperandMatcher<s64, "selectArithShiftedRegister">,
GIComplexPatternEquiv<arith_shifted_reg64>;
// A logical shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
// {5-0} - imm6
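// Unlike the arithmetic form, ror is accepted here, e.g.
// "and x0, x1, x2, ror #8".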
class logical_shift<int width> : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = !cast<AsmOperandClass>(
"LogicalShifterOperand" # width);
}
def logical_shift32 : logical_shift<32>;
def logical_shift64 : logical_shift<64>;
class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
: Operand<Ty>,
ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
let PrintMethod = "printShiftedRegister";
let MIOperandInfo = (ops regclass, shiftop);
}
def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
def gi_logical_shifted_reg32 :
GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">,
GIComplexPatternEquiv<logical_shifted_reg32>;
def gi_logical_shifted_reg64 :
GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">,
GIComplexPatternEquiv<logical_shifted_reg64>;
// A logical vector shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0, #8, #16, or #24
def logical_vec_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getVecShifterOpValue";
let ParserMatchClass = LogicalVecShifterOperand;
}
// A logical vector half-word shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0 or #8
def logical_vec_hw_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getVecShifterOpValue";
let ParserMatchClass = LogicalVecHalfWordShifterOperand;
}
// A vector move shifter operand:
// {0} - imm1: #8 or #16
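// MSL shifts ones (rather than zeroes) in from the right, so e.g.
// "movi v0.4s, #0xab, msl #8" places 0x0000abff in each lane.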
def move_vec_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getMoveVecShifterOpValue";
let ParserMatchClass = MoveVecShifterOperand;
}
let DiagnosticType = "AddSubSecondSource" in {
def AddSubImmOperand : AsmOperandClass {
let Name = "AddSubImm";
let ParserMethod = "tryParseImmWithOptionalShift";
let RenderMethod = "addImmWithOptionalShiftOperands<12>";
}
def AddSubImmNegOperand : AsmOperandClass {
let Name = "AddSubImmNeg";
let ParserMethod = "tryParseImmWithOptionalShift";
let RenderMethod = "addImmNegWithOptionalShiftOperands<12>";
}
}
// An ADD/SUB immediate shifter operand:
// second operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0 or #12
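// For example, "add x0, x1, #0x123000" is encoded with imm12 = 0x123 and the
// shift field selecting lsl #12.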
class addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmOperand;
let MIOperandInfo = (ops i32imm, i32imm);
}
class addsub_shifted_imm_neg<ValueType Ty>
: Operand<Ty> {
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmNegOperand;
let MIOperandInfo = (ops i32imm, i32imm);
}
def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
def gi_addsub_shifted_imm32 :
GIComplexOperandMatcher<s32, "selectArithImmed">,
GIComplexPatternEquiv<addsub_shifted_imm32>;
def gi_addsub_shifted_imm64 :
GIComplexOperandMatcher<s64, "selectArithImmed">,
GIComplexPatternEquiv<addsub_shifted_imm64>;
class neg_addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmOperand;
let MIOperandInfo = (ops i32imm, i32imm);
}
def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
def gi_neg_addsub_shifted_imm32 :
GIComplexOperandMatcher<s32, "selectNegArithImmed">,
GIComplexPatternEquiv<neg_addsub_shifted_imm32>;
def gi_neg_addsub_shifted_imm64 :
GIComplexOperandMatcher<s64, "selectNegArithImmed">,
GIComplexPatternEquiv<neg_addsub_shifted_imm64>;
// An extend operand:
// {5-3} - extend type
// {2-0} - imm3
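// For example, "add x0, sp, w2, uxtw #2" zero-extends w2 to 64 bits and then
// shifts it left by 2 before the addition.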
def arith_extend : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperand;
}
def arith_extend64 : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperand64;
}
// 'extend' that's a lsl of a 64-bit register.
def arith_extendlsl64 : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperandLSL64;
}
class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
let PrintMethod = "printExtendedRegister";
let MIOperandInfo = (ops GPR32, arith_extend);
}
class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
let PrintMethod = "printExtendedRegister";
let MIOperandInfo = (ops GPR32, arith_extend64);
}
def arith_extended_reg32_i32 : arith_extended_reg32<i32>;
def gi_arith_extended_reg32_i32 :
GIComplexOperandMatcher<s32, "selectArithExtendedRegister">,
GIComplexPatternEquiv<arith_extended_reg32_i32>;
def arith_extended_reg32_i64 : arith_extended_reg32<i64>;
def gi_arith_extended_reg32_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
GIComplexPatternEquiv<arith_extended_reg32_i64>;
def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>;
def gi_arith_extended_reg32to64_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
// Floating-point immediate.
def fpimm16 : Operand<f16>,
FPImmLeaf<f16, [{
return AArch64_AM::getFP16Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP16Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
def fpimm32 : Operand<f32>,
FPImmLeaf<f32, [{
return AArch64_AM::getFP32Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP32Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
def fpimm64 : Operand<f64>,
FPImmLeaf<f64, [{
return AArch64_AM::getFP64Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP64Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
def fpimm8 : Operand<i32> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
def fpimm0 : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.0);
}]>;
// Vector lane operands
class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let Name = NamePrefix # "IndexRange" # Min # "_" # Max;
let DiagnosticType = "Invalid" # Name;
let PredicateMethod = "isVectorIndex<" # Min # ", " # Max # ">";
let RenderMethod = "addVectorIndexOperands";
}
class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc>
: Operand<ty> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
}
multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
def "" : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>;
def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>;
}
def VectorIndex1Operand : AsmVectorIndex<1, 1>;
def VectorIndexBOperand : AsmVectorIndex<0, 15>;
def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm VectorIndexH : VectorIndex<i64, VectorIndexHOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm VectorIndexS : VectorIndex<i64, VectorIndexSOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
defm VectorIndexD : VectorIndex<i64, VectorIndexDOperand,
[{ return ((uint64_t)Imm) < 2; }]>;
defm VectorIndex132b : VectorIndex<i32, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB32b : VectorIndex<i32, VectorIndexBOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm VectorIndexH32b : VectorIndex<i32, VectorIndexHOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm VectorIndexS32b : VectorIndex<i32, VectorIndexSOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
defm VectorIndexD32b : VectorIndex<i32, VectorIndexDOperand,
[{ return ((uint64_t)Imm) < 2; }]>;
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
defm sve_elm_idx_extdup_b
: VectorIndex<i64, SVEVectorIndexExtDupBOperand,
[{ return ((uint64_t)Imm) < 64; }]>;
defm sve_elm_idx_extdup_h
: VectorIndex<i64, SVEVectorIndexExtDupHOperand,
[{ return ((uint64_t)Imm) < 32; }]>;
defm sve_elm_idx_extdup_s
: VectorIndex<i64, SVEVectorIndexExtDupSOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm sve_elm_idx_extdup_d
: VectorIndex<i64, SVEVectorIndexExtDupDOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm sve_elm_idx_extdup_q
: VectorIndex<i64, SVEVectorIndexExtDupQOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
def simdimmtype10 : Operand<i32>,
FPImmLeaf<f64, [{
return AArch64_AM::isAdvSIMDModImmType10(
Imm.bitcastToAPInt().getZExtValue());
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
.bitcastToAPInt()
.getZExtValue());
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = SIMDImmType10Operand;
let PrintMethod = "printSIMDType10Operand";
}
//---
// System management
//---
// Base encoding for system instruction operands.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
list<dag> pattern = []>
: I<oops, iops, asm, operands, "", pattern> {
let Inst{31-22} = 0b1101010100;
let Inst{21} = L;
}
// System instructions which do not have an Rt register.
class SimpleSystemI<bit L, dag iops, string asm, string operands,
list<dag> pattern = []>
: BaseSystemI<L, (outs), iops, asm, operands, pattern> {
let Inst{4-0} = 0b11111;
}
// System instructions which have an Rt register.
class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
: BaseSystemI<L, oops, iops, asm, operands>,
Sched<[WriteSys]> {
bits<5> Rt;
let Inst{4-0} = Rt;
}
// System instructions for transactional memory extension
class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops,
string asm, string operands, list<dag> pattern>
: BaseSystemI<L, oops, iops, asm, operands, pattern>,
Sched<[WriteSys]> {
let Inst{20-12} = 0b000110011;
let Inst{11-8} = CRm;
let Inst{7-5} = op2;
let DecoderMethod = "";
let mayLoad = 1;
let mayStore = 1;
}
// System instructions for transactional memory - single input operand
class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
: TMBaseSystemI<0b1, CRm, 0b011,
(outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> {
bits<5> Rt;
let Inst{4-0} = Rt;
}
// System instructions for transactional memory - no operand
class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
: TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
let Inst{4-0} = 0b11111;
}
// System instructions for exit from transactions
class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
: I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
let Inst{23-21} = op1;
let Inst{20-5} = imm;
let Inst{4-0} = 0b00000;
}
// Hint instructions that take both a CRm and a 3-bit immediate.
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity
let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
class HintI<string mnemonic>
: SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
[(int_aarch64_hint imm0_127:$imm)]>,
Sched<[WriteHint]> {
bits <7> imm;
let Inst{20-12} = 0b000110010;
let Inst{11-5} = imm;
}
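// The architected hints are plain aliases of this instruction, e.g.
// NOP = HINT #0, YIELD = HINT #1, WFE = HINT #2, WFI = HINT #3, SEV = HINT #4.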
// System instructions taking a single literal operand which encodes into
// CRm. op2 differentiates the opcodes.
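// For example, in "dmb ish" the barrier option (ish) is the CRm value and op2
// distinguishes DMB from DSB and ISB.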
def BarrierAsmOperand : AsmOperandClass {
let Name = "Barrier";
let ParserMethod = "tryParseBarrierOperand";
}
def barrier_op : Operand<i32> {
let PrintMethod = "printBarrierOption";
let ParserMatchClass = BarrierAsmOperand;
}
class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
list<dag> pattern = []>
: SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
Sched<[WriteBarrier]> {
bits<4> CRm;
let Inst{20-12} = 0b000110011;
let Inst{11-8} = CRm;
let Inst{7-5} = opc;
}
class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []>
: SimpleSystemI<0, (ins), asm, "", pattern>,
Sched<[]> {
bits<4> CRm;
let CRm = 0b0011;
let Inst{31-12} = 0b11010101000000110010;
let Inst{11-8} = CRm;
let Inst{7-5} = op2;
let Inst{4-0} = 0b11111;
}
// MRS/MSR system instructions. These have different operand classes because
// a different subset of registers can be accessed through each instruction.
def MRSSystemRegisterOperand : AsmOperandClass {
let Name = "MRSSystemRegister";
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MRS";
}
// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
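// The field widths are op0(2) + op1(3) + CRn(4) + CRm(4) + op2(3) = 16 bits.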
def mrs_sysreg_op : Operand<i32> {
let ParserMatchClass = MRSSystemRegisterOperand;
let DecoderMethod = "DecodeMRSSystemRegister";
let PrintMethod = "printMRSSystemRegister";
}
def MSRSystemRegisterOperand : AsmOperandClass {
let Name = "MSRSystemRegister";
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MSR";
}
def msr_sysreg_op : Operand<i32> {
let ParserMatchClass = MSRSystemRegisterOperand;
let DecoderMethod = "DecodeMSRSystemRegister";
let PrintMethod = "printMSRSystemRegister";
}
def PSBHintOperand : AsmOperandClass {
let Name = "PSBHint";
let ParserMethod = "tryParsePSBHint";
}
def psbhint_op : Operand<i32> {
let ParserMatchClass = PSBHintOperand;
let PrintMethod = "printPSBHintOp";
let MCOperandPredicate = [{
// Check if the operand is valid, to fix exhaustive aliasing in disassembly.
// "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
}];
}
def BTIHintOperand : AsmOperandClass {
let Name = "BTIHint";
let ParserMethod = "tryParseBTIHint";
}
def btihint_op : Operand<i32> {
let ParserMatchClass = BTIHintOperand;
let PrintMethod = "printBTIHintOp";
let MCOperandPredicate = [{
// "bti" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
return AArch64BTIHint::lookupBTIByEncoding((MCOp.getImm() ^ 32) >> 1) != nullptr;
}];
}
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
}
// FIXME: Some of these def NZCV, others don't. Best way to model that?
// Explicitly modeling each of the system register as a register class
// would do it, but feels like overkill at this point.
class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
"msr", "\t$systemreg, $Rt"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
}
def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
let Name = "SystemPStateFieldWithImm0_15";
let ParserMethod = "tryParseSysReg";
}
def pstatefield4_op : Operand<i32> {
let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
let PrintMethod = "printSystemPStateField";
}
// Instructions to modify PSTATE, no input reg
let Defs = [NZCV] in
class PstateWriteSimple<dag iops, string asm, string operands>
: SimpleSystemI<0, iops, asm, operands> {
let Inst{20-19} = 0b00;
let Inst{15-12} = 0b0100;
}
class MSRpstateImm0_15
: PstateWriteSimple<(ins pstatefield4_op:$pstatefield, imm0_15:$imm), "msr",
"\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bits<4> imm;
let Inst{18-16} = pstatefield{5-3};
let Inst{11-8} = imm;
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
// MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
// Fail the decoder should attempt to decode the instruction as MSRI.
let hasCompleteDecoder = 0;
}
def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
let Name = "SystemPStateFieldWithImm0_1";
let ParserMethod = "tryParseSysReg";
}
def pstatefield1_op : Operand<i32> {
let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
let PrintMethod = "printSystemPStateField";
}
class MSRpstateImm0_1
: PstateWriteSimple<(ins pstatefield1_op:$pstatefield, imm0_1:$imm), "msr",
"\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bit imm;
let Inst{18-16} = pstatefield{5-3};
let Inst{11-9} = 0b000;
let Inst{8} = imm;
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
// MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
// Fail the decoder should attempt to decode the instruction as MSRI.
let hasCompleteDecoder = 0;
}
// SYS and SYSL generic system instructions.
def SysCRAsmOperand : AsmOperandClass {
let Name = "SysCR";
let ParserMethod = "tryParseSysCROperand";
}
def sys_cr_op : Operand<i32> {
let PrintMethod = "printSysCROperand";
let ParserMatchClass = SysCRAsmOperand;
}
class SystemXtI<bit L, string asm>
: RtSystemI<L, (outs),
(ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
bits<3> op1;
bits<4> Cn;
bits<4> Cm;
bits<3> op2;
let Inst{20-19} = 0b01;
let Inst{18-16} = op1;
let Inst{15-12} = Cn;
let Inst{11-8} = Cm;
let Inst{7-5} = op2;
}
class SystemLXtI<bit L, string asm>
: RtSystemI<L, (outs),
(ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
bits<3> op1;
bits<4> Cn;
bits<4> Cm;
bits<3> op2;
let Inst{20-19} = 0b01;
let Inst{18-16} = op1;
let Inst{15-12} = Cn;
let Inst{11-8} = Cm;
let Inst{7-5} = op2;
}
// Branch (register) instructions:
//
// case opc of
// 0001 blr
// 0000 br
// 0101 drps
// 0100 eret
// 0010 ret
// otherwise UNDEFINED
class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
string operands, list<dag> pattern>
: I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
let Inst{31-25} = 0b1101011;
let Inst{24-21} = opc;
let Inst{20-16} = 0b11111;
let Inst{15-10} = 0b000000;
let Inst{4-0} = 0b00000;
}
class BranchReg<bits<4> opc, string asm, list<dag> pattern>
: BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
bits<5> Rn;
let Inst{9-5} = Rn;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
class SpecialReturn<bits<4> opc, string asm>
: BaseBranchReg<opc, (outs), (ins), asm, "", []> {
let Inst{9-5} = 0b11111;
}
let mayLoad = 1 in
class RCPCLoad<bits<2> sz, string asm, RegisterClass RC>
: I<(outs RC:$Rt), (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]", "", []>,
Sched<[]> {
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = sz;
let Inst{29-10} = 0b11100010111111110000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
class AuthBase<bits<1> M, dag oops, dag iops, string asm, string operands,
list<dag> pattern>
: I<oops, iops, asm, operands, "", pattern>, Sched<[]> {
let isAuthenticated = 1;
let Inst{31-25} = 0b1101011;
let Inst{20-11} = 0b1111100001;
let Inst{10} = M;
let Inst{4-0} = 0b11111;
}
class AuthBranchTwoOperands<bits<1> op, bits<1> M, string asm>
: AuthBase<M, (outs), (ins GPR64:$Rn, GPR64sp:$Rm), asm, "\t$Rn, $Rm", []> {
bits<5> Rn;
bits<5> Rm;
let Inst{24-22} = 0b100;
let Inst{21} = op;
let Inst{9-5} = Rn;
let Inst{4-0} = Rm;
}
class AuthOneOperand<bits<3> opc, bits<1> M, string asm>
: AuthBase<M, (outs), (ins GPR64:$Rn), asm, "\t$Rn", []> {
bits<5> Rn;
let Inst{24} = 0;
let Inst{23-21} = opc;
let Inst{9-5} = Rn;
}
let Uses = [LR,SP] in
class AuthReturn<bits<3> op, bits<1> M, string asm>
: AuthBase<M, (outs), (ins), asm, "", []> {
let Inst{24} = 0;
let Inst{23-21} = op;
let Inst{9-0} = 0b1111111111;
}
let mayLoad = 1 in
class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
string operands, string cstr, Operand opr>
: I<oops, iops, asm, operands, cstr, []>, Sched<[]> {
bits<10> offset;
bits<5> Rn;
bits<5> Rt;
let isAuthenticated = 1;
let Inst{31-24} = 0b11111000;
let Inst{23} = M;
let Inst{22} = offset{9};
let Inst{21} = 1;
let Inst{20-12} = offset{8-0};
let Inst{11} = W;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeAuthLoadInstruction";
}
multiclass AuthLoad<bit M, string asm, Operand opr> {
def indexed : BaseAuthLoad<M, 0, (outs GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
asm, "\t$Rt, [$Rn, $offset]", "", opr>;
def writeback : BaseAuthLoad<M, 1, (outs GPR64sp:$wback, GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
asm, "\t$Rt, [$Rn, $offset]!",
"$Rn = $wback,@earlyclobber $wback", opr>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>;
def : InstAlias<asm # "\t$Rt, [$wback]!",
(!cast<Instruction>(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>;
}
//---
// Conditional branch instruction.
//---
// Condition code.
// 4-bit immediate. Pretty-printed as <cc>
def ccode : Operand<i32> {
let PrintMethod = "printCondCode";
let ParserMatchClass = CondCode;
}
def inv_ccode : Operand<i32> {
// AL and NV are invalid in the aliases which use inv_ccode
let PrintMethod = "printInverseCondCode";
let ParserMatchClass = CondCode;
let MCOperandPredicate = [{
return MCOp.isImm() &&
MCOp.getImm() != AArch64CC::AL &&
MCOp.getImm() != AArch64CC::NV;
}];
}
// Conditional branch target. 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
def am_brcond : Operand<OtherVT> {
let EncoderMethod = "getCondBranchTargetOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
let OperandType = "OPERAND_PCREL";
}
class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
"b", ".$cond\t$target", "",
[(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
let Uses = [NZCV];
bits<4> cond;
bits<19> target;
let Inst{31-24} = 0b01010100;
let Inst{23-5} = target;
let Inst{4} = 0;
let Inst{3-0} = cond;
}
//---
// Compare-and-branch instructions.
//---
class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
: I<(outs), (ins regtype:$Rt, am_brcond:$target),
asm, "\t$Rt, $target", "",
[(node regtype:$Rt, bb:$target)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
bits<5> Rt;
bits<19> target;
let Inst{30-25} = 0b011010;
let Inst{24} = op;
let Inst{23-5} = target;
let Inst{4-0} = Rt;
}
multiclass CmpBranch<bit op, string asm, SDNode node> {
def W : BaseCmpBranch<GPR32, op, asm, node> {
let Inst{31} = 0;
}
def X : BaseCmpBranch<GPR64, op, asm, node> {
let Inst{31} = 1;
}
}
//---
// Test-bit-and-branch instructions.
//---
// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
// the target offset are implied zero and so are not part of the immediate.
def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget14Operand;
let OperandType = "OPERAND_PCREL";
}
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
let PredicateMethod = "isImmInRange<0,31>";
let RenderMethod = "addImmOperands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
let RenderMethod = "addImmOperands";
}
class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
return (((uint32_t)Imm) < 32);
}]> {
let ParserMatchClass = matcher;
}
def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
}]> {
let ParserMatchClass = TBZImm32_63Operand;
}
class BaseTestBranch<RegisterClass regtype, Operand immtype,
bit op, string asm, SDNode node>
: I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
asm, "\t$Rt, $bit_off, $target", "",
[(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
bits<5> Rt;
bits<6> bit_off;
bits<14> target;
let Inst{30-25} = 0b011011;
let Inst{24} = op;
let Inst{23-19} = bit_off{4-0};
let Inst{18-5} = target;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeTestAndBranch";
}
multiclass TestBranch<bit op, string asm, SDNode node> {
def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
let Inst{31} = 0;
}
def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
let Inst{31} = 1;
}
// Alias X-reg with 0-31 imm to W-Reg.
def : InstAlias<asm # "\t$Rd, $imm, $target",
(!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
(!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
tbz_imm0_31_diag:$imm, bb:$target)>;
}
//---
// Unconditional branch (immediate) instructions.
//---
def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
let OperandType = "OPERAND_PCREL";
}
def am_bl_target : Operand<i64> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
let OperandType = "OPERAND_PCREL";
}
class BImm<bit op, dag iops, string asm, list<dag> pattern>
: I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
bits<26> addr;
let Inst{31} = op;
let Inst{30-26} = 0b00101;
let Inst{25-0} = addr;
let DecoderMethod = "DecodeUnconditionalBranch";
}
class BranchImm<bit op, string asm, list<dag> pattern>
: BImm<op, (ins am_b_target:$addr), asm, pattern>;
class CallImm<bit op, string asm, list<dag> pattern>
: BImm<op, (ins am_bl_target:$addr), asm, pattern>;
//---
// Basic one-operand data processing instructions.
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
SDPatternOperator node>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
[(set regtype:$Rd, (node regtype:$Rn))]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-13} = 0b101101011000000000;
let Inst{12-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass OneOperandData<bits<3> opc, string asm,
SDPatternOperator node = null_frag> {
def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
let Inst{31} = 0;
}
def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
let Inst{31} = 1;
}
}
class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
: BaseOneOperandData<opc, GPR32, asm, node> {
let Inst{31} = 0;
}
class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
: BaseOneOperandData<opc, GPR64, asm, node> {
let Inst{31} = 1;
}
class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm>
: I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "",
[]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
let Inst{11-10} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm>
: I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> {
bits<5> Rd;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
let Inst{11-10} = opcode;
let Inst{9-5} = 0b11111;
let Inst{4-0} = Rd;
}
class SignAuthTwoOperand<bits<4> opc, string asm,
SDPatternOperator OpNode>
: I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64sp:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64sp:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-21} = 0b10011010110;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b00;
let Inst{13-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
: I<(outs), iops, asm, ops, "", []>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rn;
let Inst{31} = sf;
let Inst{30-15} = 0b0111010000000000;
let Inst{14} = sz;
let Inst{13-10} = 0b0010;
let Inst{9-5} = Rn;
let Inst{4-0} = 0b01101;
}
class FlagRotate<dag iops, string asm, string ops>
: BaseFlagManipulation<0b1, 0b0, iops, asm, ops> {
bits<6> imm;
bits<4> mask;
let Inst{20-15} = imm;
let Inst{13-10} = 0b0001;
let Inst{4} = 0b0;
let Inst{3-0} = mask;
}
//---
// Basic two-operand data processing instructions.
//---
class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{30} = isSub;
let Inst{28-21} = 0b11010000;
let Inst{20-16} = Rm;
let Inst{15-10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
SDNode OpNode>
: BaseBaseAddSubCarry<isSub, regtype, asm,
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
SDNode OpNode>
: BaseBaseAddSubCarry<isSub, regtype, asm,
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
(implicit NZCV)]> {
let Defs = [NZCV];
}
multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
SDNode OpNode, SDNode OpNode_setflags> {
def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
let Inst{31} = 0;
let Inst{29} = 0;
}
def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
let Inst{31} = 1;
let Inst{29} = 0;
}
// Sets flags.
def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
OpNode_setflags> {
let Inst{31} = 0;
let Inst{29} = 1;
}
def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
OpNode_setflags> {
let Inst{31} = 1;
let Inst{29} = 1;
}
}
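// Illustrative use (assumption), roughly:
//   defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
// which would expand to ADCWr/ADCXr plus the flag-setting ADCSWr/ADCSXr
// variants that set Inst{29} and add NZCV to Defs.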
class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
SDPatternOperator OpNode,
RegisterClass in1regtype = regtype,
RegisterClass in2regtype = regtype>
: I<(outs regtype:$Rd), (ins in1regtype:$Rn, in2regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set regtype:$Rd, (OpNode in1regtype:$Rn, in2regtype:$Rm))]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{30-21} = 0b0011010110;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b00;
let Inst{13-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
SDPatternOperator OpNode>
: BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
let Inst{10} = isSigned;
}
multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
Sched<[WriteID32, ReadID, ReadID]> {
let Inst{31} = 0;
}
def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
Sched<[WriteID64, ReadID, ReadID]> {
let Inst{31} = 1;
}
}
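// Illustrative use (assumption), roughly:
//   defm UDIV : Div<0, "udiv", udiv>;
//   defm SDIV : Div<1, "sdiv", sdiv>;
// The Wr and Xr forms differ only in Inst{31} and in their scheduling class
// (WriteID32 vs WriteID64).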
class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
SDPatternOperator OpNode = null_frag>
: BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
Sched<[WriteIS, ReadI]> {
let Inst{11-10} = shift_type;
}
multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
def Wr : BaseShift<shift_type, GPR32, asm> {
let Inst{31} = 0;
}
def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
let Inst{31} = 1;
}
def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
(EXTRACT_SUBREG i64:$Rm, sub_32))>;
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (sext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
(SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (zext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
(SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
}
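// Illustrative use (assumption), roughly:
//   defm LSLV : Shift<0b00, "lsl", shl>;
// The extra Pats above let the 32-bit form consume an i64 shift amount (only
// the low bits matter, since the shift is taken modulo the register width) and
// let the 64-bit form take a zero/sign-extended W register via SUBREG_TO_REG.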
class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
RegisterClass addtype, string asm,
list<dag> pattern>
: I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
let Inst{30-24} = 0b0011011;
let Inst{23-21} = opc;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
let Inst{14-10} = Ra;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
// MADD/MSUB generation is decided by MachineCombiner.cpp
def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
[/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
}
def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
[/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
}
class WideMulAccum<bit isSub, bits<3> opc, string asm,
SDNode AccNode, SDNode ExtNode>
: BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
[(set GPR64:$Rd, (AccNode GPR64:$Ra,
(mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
class MulHi<bits<3> opc, string asm, SDNode OpNode>
: I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
Sched<[WriteIM64, ReadIM, ReadIM]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-24} = 0b10011011;
let Inst{23-21} = opc;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// The Ra field of SMULH and UMULH is unused: it should be assembled as 31
// (i.e. all bits 1) but is ignored by the processor.
let PostEncoderMethod = "fixMulHigh";
}
class MulAccumWAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
class MulAccumXAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
class WideMulAccumAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
SDPatternOperator OpNode, string asm>
: I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = sf;
let Inst{30-21} = 0b0011010110;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b010;
let Inst{12} = C;
let Inst{11-10} = sz;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let Predicates = [HasCRC];
}
//---
// Address generation.
//---
class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
: I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
pattern>,
Sched<[WriteI]> {
bits<5> Xd;
bits<21> label;
let Inst{31} = page;
let Inst{30-29} = label{1-0};
let Inst{28-24} = 0b10000;
let Inst{23-5} = label{20-2};
let Inst{4-0} = Xd;
let DecoderMethod = "DecodeAdrInstruction";
}
//---
// Move immediate.
//---
def movimm32_imm : Operand<i32> {
let ParserMatchClass = AsmImmRange<0, 65535>;
let EncoderMethod = "getMoveWideImmOpValue";
let PrintMethod = "printImm";
}
def movimm32_shift : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = MovImm32ShifterOperand;
}
def movimm64_shift : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = MovImm64ShifterOperand;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
string asm>
: I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
asm, "\t$Rd, $imm$shift", "", []>,
Sched<[WriteImm]> {
bits<5> Rd;
bits<16> imm;
bits<6> shift;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100101;
let Inst{22-21} = shift{5-4};
let Inst{20-5} = imm;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeMoveImmInstruction";
}
multiclass MoveImmediate<bits<2> opc, string asm> {
def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
let Inst{31} = 0;
}
def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
let Inst{31} = 1;
}
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
string asm>
: I<(outs regtype:$Rd),
(ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<16> imm;
bits<6> shift;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100101;
let Inst{22-21} = shift{5-4};
let Inst{20-5} = imm;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeMoveImmInstruction";
}
multiclass InsertImmediate<bits<2> opc, string asm> {
def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
let Inst{31} = 0;
}
def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
let Inst{31} = 1;
}
}
//---
// Add/Subtract
//---
class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
string asm_inst, string asm_ops,
dag inputs, dag pattern>
: I<(outs dstRegtype:$Rd), inputs, asm_inst, asm_ops, "", [pattern]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b10001;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class AddSubImmShift<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass srcRegtype, addsub_shifted_imm immtype,
string asm_inst, SDPatternOperator OpNode>
: BaseAddSubImm<isSub, setFlags, dstRegtype, asm_inst, "\t$Rd, $Rn, $imm",
(ins srcRegtype:$Rn, immtype:$imm),
(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))> {
bits<14> imm;
let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
let Inst{21-10} = imm{11-0};
let DecoderMethod = "DecodeAddSubImmShift";
}
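// Worked example (illustrative): for "add x0, x1, #5, lsl #12" the 14-bit imm
// operand would hold imm{13-12} = 0b01 (the "lsl #12" case noted above) and
// imm{11-0} = 5, landing in Inst{23-22} and Inst{21-10} respectively.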
class BaseAddSubRegPseudo<RegisterClass regtype,
SDPatternOperator OpNode>
: Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]>;
class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
arith_shifted_reg shifted_regtype, string asm,
SDPatternOperator OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
// The encoding operands appear in the same order as the MI operands, so no
// encoder method or by-name matching is needed; the default in-order handling
// suffices. Because matching is by order, the field names below deliberately
// do not match the asm operand names.
bits<5> dst;
bits<5> src1;
bits<5> src2;
bits<8> shift;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-22} = shift{7-6};
let Inst{21} = 0;
let Inst{20-16} = src2;
let Inst{15-10} = shift{5-0};
let Inst{9-5} = src1;
let Inst{4-0} = dst;
let DecoderMethod = "DecodeThreeAddrSRegInstruction";
}
class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass src1Regtype, Operand src2Regtype,
string asm, SDPatternOperator OpNode>
: I<(outs dstRegtype:$R1),
(ins src1Regtype:$R2, src2Regtype:$R3),
asm, "\t$R1, $R2, $R3", "",
[(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
Sched<[WriteIEReg, ReadI, ReadIEReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> ext;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-21} = 0b001;
let Inst{20-16} = Rm;
let Inst{15-13} = ext{5-3};
let Inst{12-10} = ext{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeAddSubERegInstruction";
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
Operand ext_op, string asm>
: I<(outs dstRegtype:$Rd),
(ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
Sched<[WriteIEReg, ReadI, ReadIEReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> ext;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-21} = 0b001;
let Inst{20-16} = Rm;
let Inst{15} = ext{5};
let Inst{12-10} = ext{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeAddSubERegInstruction";
}
// Aliases for register+register add/subtract.
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
int shiftExt>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
shiftExt)>;
multiclass AddSub<bit isSub, string mnemonic, string alias,
SDPatternOperator OpNode = null_frag> {
let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// Add/Subtract immediate
// Increase the weight of the immediate variant to try to match it before
// the extended register variant.
// We used to match the register variant before the immediate when the
// register argument could be implicitly zero-extended.
let AddedComplexity = 6 in
def Wri : AddSubImmShift<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
let Inst{31} = 0;
}
let AddedComplexity = 6 in
def Xri : AddSubImmShift<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
mnemonic, OpNode> {
let Inst{31} = 1;
}
// Add/Subtract register - Only used for CodeGen
def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
// Add/Subtract shifted register
def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
OpNode> {
let Inst{31} = 0;
}
def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
OpNode> {
let Inst{31} = 1;
}
}
// Add/Subtract extended register
let AddedComplexity = 1, hasSideEffects = 0 in {
def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
arith_extended_reg32to64_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
arith_extendlsl64, mnemonic> {
// UXTX and SXTX only.
let Inst{14-13} = 0b11;
let Inst{31} = 1;
}
// add Rd, Rn, -imm -> sub Rd, Rn, imm
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
GPR32, GPR32, GPR32, 0>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
GPR64, GPR64, GPR64, 0>;
// Register/register aliases with no shift when either the destination or
// first source register is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
def : AddSubRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrx64"),
GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
def : AddSubRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrx64"),
GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
}
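// Illustrative use (assumption), roughly:
//   defm ADD : AddSub<0, "add", "sub", add>;
//   defm SUB : AddSub<1, "sub", "add">;
// The alias parameter is the opposite mnemonic, so each instantiation's
// InstSubst accepts a negated immediate spelled with the other mnemonic,
// e.g. "add x0, x1, #-8" assembles as if written "sub x0, x1, #8".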
multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
string alias, string cmpAlias> {
let isCompare = 1, Defs = [NZCV] in {
// Add/Subtract immediate
def Wri : AddSubImmShift<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xri : AddSubImmShift<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
mnemonic, OpNode> {
let Inst{31} = 1;
}
// Add/Subtract register
def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
// Add/Subtract shifted register
def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
OpNode> {
let Inst{31} = 0;
}
def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
OpNode> {
let Inst{31} = 1;
}
// Add/Subtract extended register
let AddedComplexity = 1 in {
def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
arith_extended_reg32_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
arith_extendlsl64, mnemonic> {
// UXTX and SXTX only.
let Inst{14-13} = 0b11;
let Inst{31} = 1;
}
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Compare aliases
def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
GPR32, GPR32, GPR32, 0>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
GPR64, GPR64, GPR64, 0>;
// Register/register aliases with no shift when the first source register
// is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
def : AddSubRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrx64"),
GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
}
class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode>
: BaseAddSubImm<
isSub, 0, GPR64sp, asm_inst, "\t$Rd, $Rn, $imm6, $imm4",
(ins GPR64sp:$Rn, uimm6s16:$imm6, imm0_15:$imm4),
(set GPR64sp:$Rd, (OpNode GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4))> {
bits<6> imm6;
bits<4> imm4;
let Inst{31} = 1;
let Inst{23-22} = 0b10;
let Inst{21-16} = imm6;
let Inst{15-14} = 0b00;
let Inst{13-10} = imm4;
let Unpredictable{15-14} = 0b11;
}
class SUBP<bit setsFlags, string asm_instr, SDPatternOperator OpNode>
: BaseTwoOperand<0b0000, GPR64, asm_instr, OpNode, GPR64sp, GPR64sp> {
let Inst{31} = 1;
let Inst{29} = setsFlags;
}
//---
// Extract
//---
def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisPtrTy<3>]>;
def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
list<dag> patterns>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
Sched<[WriteExtr, ReadExtrHi]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> imm;
let Inst{30-23} = 0b00100111;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-10} = imm;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass ExtractImm<string asm> {
def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
[(set GPR32:$Rd,
(AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0;
// imm<5> must be zero.
let imm{5} = 0;
}
def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
[(set GPR64:$Rd,
(AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
let Inst{31} = 1;
let Inst{22} = 1;
}
}
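// Illustrative note: EXTR with Rn == Rm is the rotate-right-by-immediate idiom
// (assemblers accept it as "ror Rd, Rs, #imm"); the W form forces imm<5> to
// zero so the rotate amount stays in [0, 31].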
//---
// Bitfield
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseBitfieldImm<bits<2> opc,
RegisterClass regtype, Operand imm_type, string asm>
: I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
Sched<[WriteIS, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<6> immr;
bits<6> imms;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100110;
let Inst{21-16} = immr;
let Inst{15-10} = imms;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass BitfieldImm<bits<2> opc, string asm> {
def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
let Inst{31} = 0;
let Inst{22} = 0;
// imms<5> and immr<5> must be zero, else ReservedValue().
let Inst{21} = 0;
let Inst{15} = 0;
}
def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
let Inst{31} = 1;
let Inst{22} = 1;
}
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseBitfieldImmWith2RegArgs<bits<2> opc,
RegisterClass regtype, Operand imm_type, string asm>
: I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
imm_type:$imms),
asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
Sched<[WriteIS, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<6> immr;
bits<6> imms;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100110;
let Inst{21-16} = immr;
let Inst{15-10} = imms;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
let Inst{31} = 0;
let Inst{22} = 0;
// imms<5> and immr<5> must be zero, else ReservedValue().
let Inst{21} = 0;
let Inst{15} = 0;
}
def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
let Inst{31} = 1;
let Inst{22} = 1;
}
}
//---
// Logical
//---
// Logical (immediate)
class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
RegisterClass sregtype, Operand imm_type, string asm,
list<dag> pattern>
: I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
asm, "\t$Rd, $Rn, $imm", "", pattern>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<13> imm;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100100;
let Inst{22} = imm{12};
let Inst{21-16} = imm{11-6};
let Inst{15-10} = imm{5-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeLogicalImmInstruction";
}
// Logical (shifted register)
class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
logical_shifted_reg shifted_regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
// The encoding operands appear in the same order as the MI operands, so no
// encoder method or by-name matching is needed; the default in-order handling
// suffices. Because matching is by order, the field names below deliberately
// do not match the asm operand names.
bits<5> dst;
bits<5> src1;
bits<5> src2;
bits<8> shift;
let Inst{30-29} = opc;
let Inst{28-24} = 0b01010;
let Inst{23-22} = shift{7-6};
let Inst{21} = N;
let Inst{20-16} = src2;
let Inst{15-10} = shift{5-0};
let Inst{9-5} = src1;
let Inst{4-0} = dst;
let DecoderMethod = "DecodeThreeAddrSRegInstruction";
}
// Aliases for register+register logical instructions.
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
[(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
}
let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
[(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
logical_imm64:$imm))]> {
let Inst{31} = 1;
}
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
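// Illustrative use (assumption), roughly:
//   defm AND : LogicalImm<0b00, "and", and, "bic">;
// With the InstSubst above, writing "bic w0, w1, #0xff" would be accepted and
// encoded as "and w0, w1, #0xffffff00": logical_imm32_not matches an immediate
// whose bitwise NOT is a valid logical immediate and encodes that NOT.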
multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
let isCompare = 1, Defs = [NZCV] in {
def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
}
def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
let Inst{31} = 1;
}
} // end Defs = [NZCV]
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
: Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]>;
// Split from LogicalImm as not all instructions have both.
multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode> {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
}
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn,
logical_shifted_reg32:$Rm))]> {
let Inst{31} = 0;
}
def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn,
logical_shifted_reg64:$Rm))]> {
let Inst{31} = 1;
}
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Wrs"), GPR32>;
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrs"), GPR64>;
}
// Split from LogicalReg to allow setting NZCV Defs
multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
let Inst{31} = 0;
}
def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
let Inst{31} = 1;
}
} // Defs = [NZCV]
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Wrs"), GPR32>;
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrs"), GPR64>;
}
//---
// Conditionally set flags
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
string mnemonic, SDNode OpNode>
: I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
[(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
bits<5> Rn;
bits<5> imm;
bits<4> nzcv;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b111010010;
let Inst{20-16} = imm;
let Inst{15-12} = cond;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = nzcv;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
SDNode OpNode>
: I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
[(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
bits<5> Rn;
bits<5> Rm;
bits<4> nzcv;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b111010010;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = nzcv;
}
multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
// immediate operand variants
def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
let Inst{31} = 1;
}
// register operand variants
def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
//---
// Conditional select
//---
class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
asm, "\t$Rd, $Rn, $Rm, $cond", "",
[(set regtype:$Rd,
(AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b011010100;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = op2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass CondSelect<bit op, bits<2> op2, string asm> {
def Wr : BaseCondSelect<op, op2, GPR32, asm> {
let Inst{31} = 0;
}
def Xr : BaseCondSelect<op, op2, GPR64, asm> {
let Inst{31} = 1;
}
}
class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
PatFrag frag>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
asm, "\t$Rd, $Rn, $Rm, $cond", "",
[(set regtype:$Rd,
(AArch64csel regtype:$Rn, (frag regtype:$Rm),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b011010100;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = op2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
def inv_cond_XFORM : SDNodeXForm<imm, [{
AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N),
MVT::i32);
}]>;
multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
let Inst{31} = 0;
}
def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
let Inst{31} = 1;
}
def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
(!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
(inv_cond_XFORM imm:$cond))>;
def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
(!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
(inv_cond_XFORM imm:$cond))>;
}
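// Illustrative note on the two Pats above: when the value wrapped in `frag` is
// the one selected on a true condition, the operands are swapped into the
// (Rn, Rm) order the instruction expects and inv_cond_XFORM inverts the
// condition code, so the selected result is unchanged.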
//---
// Special Mask Value
//---
def maski8_or_more : Operand<i32>,
ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
}
def maski16_or_more : Operand<i32>,
ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
}
//---
// Load/store
//---
// (unsigned immediate)
// Indexed addressing for 8-bit accesses; the offset is in range [0,4095].
def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
def gi_am_indexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">,
GIComplexPatternEquiv<am_indexed8>;
def gi_am_indexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<16>">,
GIComplexPatternEquiv<am_indexed16>;
def gi_am_indexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<32>">,
GIComplexPatternEquiv<am_indexed32>;
def gi_am_indexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<64>">,
GIComplexPatternEquiv<am_indexed64>;
def gi_am_indexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<128>">,
GIComplexPatternEquiv<am_indexed128>;
class UImm12OffsetOperand<int Scale> : AsmOperandClass {
let Name = "UImm12Offset" # Scale;
let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
let PredicateMethod = "isUImm12Offset<" # Scale # ">";
let DiagnosticType = "InvalidMemoryIndexed" # Scale;
}
def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
class uimm12_scaled<int Scale> : Operand<i64> {
let ParserMatchClass
= !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
let EncoderMethod
= "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
let PrintMethod = "printUImm12Offset<" # Scale # ">";
}
def uimm12s1 : uimm12_scaled<1>;
def uimm12s2 : uimm12_scaled<2>;
def uimm12s4 : uimm12_scaled<4>;
def uimm12s8 : uimm12_scaled<8>;
def uimm12s16 : uimm12_scaled<16>;
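// Worked example (illustrative): with Scale = 8 the assembly-level offset must
// be a multiple of 8 in [0, 32760]; e.g. "ldr x0, [x1, #32]" would put
// 32 / 8 = 4 into the 12-bit offset field.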
class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, list<dag> pattern>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
bits<5> Rt;
bits<5> Rn;
bits<12> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b01;
let Inst{23-22} = opc;
let Inst{21-10} = offset;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeUnsignedLdStInstruction";
}
multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
Sched<[WriteLD]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
Sched<[WriteST]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
// Same as StoreUI, but takes a RegisterOperand. This is used by GlobalISel to
// substitute zero-registers automatically.
//
// TODO: Roll out zero-register substitution to GPR32/GPR64 and fold this back
// into StoreUI.
multiclass StoreUIz<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
Sched<[WriteST]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
def PrefetchOperand : AsmOperandClass {
let Name = "Prefetch";
let ParserMethod = "tryParsePrefetch";
}
def prfop : Operand<i32> {
let PrintMethod = "printPrefetchOp";
let ParserMatchClass = PrefetchOperand;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
: BaseLoadStoreUI<sz, V, opc,
(outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
asm, pat>,
Sched<[WriteLD]>;
//---
// Load literal
//---
// Load literal address: 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
def am_ldrlit : Operand<iPTR> {
let EncoderMethod = "getLoadLiteralOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
let OperandType = "OPERAND_PCREL";
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0, AddedComplexity = 20 in
class LoadLiteral<bits<2> opc, bit V, RegisterOperand regtype, string asm, list<dag> pat>
: I<(outs regtype:$Rt), (ins am_ldrlit:$label),
asm, "\t$Rt, $label", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<19> label;
let Inst{31-30} = opc;
let Inst{29-27} = 0b011;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-5} = label;
let Inst{4-0} = Rt;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
: I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
asm, "\t$Rt, $label", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<19> label;
let Inst{31-30} = opc;
let Inst{29-27} = 0b011;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-5} = label;
let Inst{4-0} = Rt;
}
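// Illustrative note: the 19-bit label plus the two implied-zero low bits gives
// a word-aligned PC-relative range of roughly +/-1 MiB for the literal loads
// and prefetches built from these classes.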
//---
// Load/store register offset
//---
def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
def gi_ro_Xindexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">,
GIComplexPatternEquiv<ro_Xindexed8>;
def gi_ro_Xindexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">,
GIComplexPatternEquiv<ro_Xindexed16>;
def gi_ro_Xindexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">,
GIComplexPatternEquiv<ro_Xindexed32>;
def gi_ro_Xindexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">,
GIComplexPatternEquiv<ro_Xindexed64>;
def gi_ro_Xindexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">,
GIComplexPatternEquiv<ro_Xindexed128>;
def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
def gi_ro_Windexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">,
GIComplexPatternEquiv<ro_Windexed8>;
def gi_ro_Windexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<16>">,
GIComplexPatternEquiv<ro_Windexed16>;
def gi_ro_Windexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<32>">,
GIComplexPatternEquiv<ro_Windexed32>;
def gi_ro_Windexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<64>">,
GIComplexPatternEquiv<ro_Windexed64>;
def gi_ro_Windexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<128>">,
GIComplexPatternEquiv<ro_Windexed128>;
class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
let Name = "Mem" # Reg # "Extend" # Width;
let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
let RenderMethod = "addMemExtendOperands";
let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
}
def MemWExtend8Operand : MemExtendOperand<"W", 8> {
// The address "[x0, x1, lsl #0]" actually maps to the variant which performs
// the trivial shift.
let RenderMethod = "addMemExtend8Operands";
}
def MemWExtend16Operand : MemExtendOperand<"W", 16>;
def MemWExtend32Operand : MemExtendOperand<"W", 32>;
def MemWExtend64Operand : MemExtendOperand<"W", 64>;
def MemWExtend128Operand : MemExtendOperand<"W", 128>;
def MemXExtend8Operand : MemExtendOperand<"X", 8> {
// The address "[x0, x1, lsl #0]" actually maps to the variant which performs
// the trivial shift.
let RenderMethod = "addMemExtend8Operands";
}
def MemXExtend16Operand : MemExtendOperand<"X", 16>;
def MemXExtend32Operand : MemExtendOperand<"X", 32>;
def MemXExtend64Operand : MemExtendOperand<"X", 64>;
def MemXExtend128Operand : MemExtendOperand<"X", 128>;
class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
: Operand<i32> {
let ParserMatchClass = ParserClass;
let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
let DecoderMethod = "DecodeMemExtend";
let EncoderMethod = "getMemExtendOpValue";
let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
}
def ro_Wextend8 : ro_extend<MemWExtend8Operand, "w", 8>;
def ro_Wextend16 : ro_extend<MemWExtend16Operand, "w", 16>;
def ro_Wextend32 : ro_extend<MemWExtend32Operand, "w", 32>;
def ro_Wextend64 : ro_extend<MemWExtend64Operand, "w", 64>;
def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
def ro_Xextend8 : ro_extend<MemXExtend8Operand, "x", 8>;
def ro_Xextend16 : ro_extend<MemXExtend16Operand, "x", 16>;
def ro_Xextend32 : ro_extend<MemXExtend32Operand, "x", 32>;
def ro_Xextend64 : ro_extend<MemXExtend64Operand, "x", 64>;
def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
Operand wextend, Operand xextend> {
// CodeGen-level pattern covering the entire addressing mode.
ComplexPattern Wpat = windex;
ComplexPattern Xpat = xindex;
// Asm-level Operand covering the valid "uxtw #3" style syntax.
Operand Wext = wextend;
Operand Xext = xextend;
}
def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
ro_Xextend128>;
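// Illustrative note: ro64, for example, ties the W- and X-indexed CodeGen
// patterns to the operands that parse extends such as
//   ldr x0, [x1, w2, uxtw #3]   // W-register offset
//   ldr x0, [x1, x2, lsl #3]    // X-register offset
// so each access size is paired with its matching shift amount (#3 here,
// since the access is 8 bytes).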
class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
class ROInstAlias<string asm, RegisterOperand regtype, Instruction INST>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend8:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore8RO<sz, V, opc, regtype, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend8:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend8:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend8:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend16:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend16:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend16:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend16:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend32:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend32:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend32:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend32:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend128:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend128:$extend)))]>,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
[]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
[]>,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
string asm, list<dag> pat>
: I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
}
multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
def roW : BasePrefetchRO<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
asm, [(AArch64Prefetch imm:$Rt,
(ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))]> {
let Inst{13} = 0b0;
}
def roX : BasePrefetchRO<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
asm, [(AArch64Prefetch imm:$Rt,
(ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))]> {
let Inst{13} = 0b1;
}
def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
(!cast<Instruction>(NAME # "roX") prfop:$Rt,
GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
}
//---
// Load/store unscaled immediate
//---
def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
def gi_am_unscaled8 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled8">,
GIComplexPatternEquiv<am_unscaled8>;
def gi_am_unscaled16 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled16">,
GIComplexPatternEquiv<am_unscaled16>;
def gi_am_unscaled32 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled32">,
GIComplexPatternEquiv<am_unscaled32>;
def gi_am_unscaled64 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled64">,
GIComplexPatternEquiv<am_unscaled64>;
def gi_am_unscaled128 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled128">,
GIComplexPatternEquiv<am_unscaled128>;
class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, list<dag> pattern>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
}
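// Illustrative note: the unscaled forms (the LDUR/STUR family) take a signed
// 9-bit byte offset, i.e. [-256, 255], with no scaling by the access size,
// unlike the unsigned-immediate forms above.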
// Armv8.4 LDAPR & STLR with immediate offset instructions
multiclass BaseLoadUnscaleV84<string asm, bits<2> sz, bits<2> opc,
RegisterOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm, []>,
Sched<[WriteST]> {
let Inst{29} = 0;
let Inst{24} = 1;
}
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass BaseStoreUnscaleV84<string asm, bits<2> sz, bits<2> opc,
RegisterOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, []>,
Sched<[WriteST]> {
let Inst{29} = 0;
let Inst{24} = 1;
}
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before LoadUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
Sched<[WriteLD]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before StoreUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, pattern>,
Sched<[WriteST]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
list<dag> pat> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, pat>,
Sched<[WriteLD]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
}
//---
// Load/store unscaled immediate, unprivileged
//---
class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
dag oops, dag iops, string asm>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
}
multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
RegisterClass regtype, string asm> {
let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm>,
Sched<[WriteLD]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
RegisterClass regtype, string asm> {
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm>,
Sched<[WriteST]>;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
//---
// Load/store pre-indexed
//---
class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b11;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
}
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm,
"$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
//---
// Load/store post-indexed
//---
class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0b0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b01;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
}
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
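Both writeback forms above tie the updated base ($wback) to $Rn through the constraint string, so they end up in the same physical register; the only difference is when the immediate is applied. A small sketch of the two addressing modes, illustrative only and not part of this file:

#include <cstdint>

// Pre-indexed vs post-indexed writeback addressing.
struct MemAccess { uint64_t address; uint64_t newBase; };

MemAccess preIndexed(uint64_t rn, int64_t simm9) {
  uint64_t addr = rn + simm9;   // offset applied before the access
  return {addr, addr};          // base updated to the new address
}

MemAccess postIndexed(uint64_t rn, int64_t simm9) {
  return {rn, rn + simm9};      // access uses the old base, then update
}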
//---
// Load/store pair
//---
// (indexed, offset)
class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
let Inst{25-23} = 0b010;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
}
multiclass LoadPairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairOffset<opc, V, 1,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
asm>,
Sched<[WriteSTP]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
// (pre-indexed)
class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
let Inst{25-23} = 0b011;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
}
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
asm>,
Sched<[WriteAdr, WriteSTP]>;
} // hasSideEffects = 0
// (post-indexed)
class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
let Inst{25-23} = 0b001;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
}
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, idxtype:$offset), asm>,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, idxtype:$offset),
asm>,
Sched<[WriteAdr, WriteSTP]>;
} // hasSideEffects = 0
// (no-allocate)
class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
let Inst{25-23} = 0b000;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
}
multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairNoAlloc<opc, V, 1,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
asm>,
Sched<[WriteSTP]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
//---
// Load/store exclusive
//---
// True exclusive operations write to and/or read from the system's exclusive
// monitors, which as far as a compiler is concerned can be modelled as a
// random shared memory address. Hence LoadExclusive is also marked mayStore.
//
// Since these instructions have the undefined register bits set to 1 in
// their canonical form, we need a post encoder method to set those bits
// to 1 when encoding these instructions. We do this using the
// fixLoadStoreExclusive function. This function has template parameters:
//
// fixLoadStoreExclusive<int hasRs, int hasRt2>
//
// hasRs indicates that the instruction uses the Rs field, so we won't set
// it to 1 (and the same for Rt2). We don't need template parameters for
// the other register fields since Rt and Rn are always used.
//
let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
dag oops, dag iops, string asm, string operands>
: I<oops, iops, asm, operands, "", []> {
let Inst{31-30} = sz;
let Inst{29-24} = 0b001000;
let Inst{23} = o2;
let Inst{22} = L;
let Inst{21} = o1;
let Inst{15} = o0;
let DecoderMethod = "DecodeExclusiveLdStInstruction";
}
// Neither Rs nor Rt2 operands.
class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
dag oops, dag iops, string asm, string operands>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
bits<5> Rt;
bits<5> Rn;
let Inst{20-16} = 0b11111;
let Unpredictable{20-16} = 0b11111;
let Inst{14-10} = 0b11111;
let Unpredictable{14-10} = 0b11111;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
}
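The comment block above describes what the fixLoadStoreExclusive<hasRs, hasRt2> post-encoder has to do; the actual implementation lives in the MC layer and may differ in detail, but a rough sketch of the fix-up it describes looks like this (illustrative only, not part of this file):

#include <cstdint>

// Unused Rs (bits 20-16) and Rt2 (bits 14-10) fields must read as all
// ones in the canonical encoding, matching the 0b11111 fields above.
uint32_t fixLoadStoreExclusiveSketch(uint32_t insn, bool hasRs, bool hasRt2) {
  if (!hasRs)
    insn |= 0x1Fu << 16;  // Rs field  -> 0b11111
  if (!hasRt2)
    insn |= 0x1Fu << 10;  // Rt2 field -> 0b11111
  return insn;
}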
// Simple load acquires do not set the exclusive monitor.
let mayLoad = 1, mayStore = 0 in
class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
(ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
Sched<[WriteLD]>;
class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
(ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
Sched<[WriteLD]>;
class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp0:$Rn), asm,
"\t$Rt, $Rt2, [$Rn]">,
Sched<[WriteLD, WriteLDHi]> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
}
// Simple store release operations do not check the exclusive monitor.
let mayLoad = 0, mayStore = 1 in
class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
(ins regtype:$Rt, GPR64sp0:$Rn),
asm, "\t$Rt, [$Rn]">,
Sched<[WriteST]>;
let mayLoad = 1, mayStore = 1 in
class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
(ins regtype:$Rt, GPR64sp0:$Rn),
asm, "\t$Ws, $Rt, [$Rn]">,
Sched<[WriteSTX]> {
bits<5> Ws;
bits<5> Rt;
bits<5> Rn;
let Inst{20-16} = Ws;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Constraints = "@earlyclobber $Ws";
let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
}
class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0,
(outs GPR32:$Ws),
(ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
Sched<[WriteSTX]> {
bits<5> Ws;
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
let Inst{20-16} = Ws;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Constraints = "@earlyclobber $Ws";
}
// Armv8.5-A Memory Tagging Extension
class BaseMemTag<bits<2> opc1, bits<2> opc2, string asm_insn,
string asm_opnds, string cstr, dag oops, dag iops>
: I<oops, iops, asm_insn, asm_opnds, cstr, []>,
Sched<[]> {
bits<5> Rn;
let Inst{31-24} = 0b11011001;
let Inst{23-22} = opc1;
let Inst{21} = 1;
// Inst{20-12} defined by subclass
let Inst{11-10} = opc2;
let Inst{9-5} = Rn;
// Inst{4-0} defined by subclass
}
class MemTagVector<bit Load, string asm_insn, string asm_opnds,
dag oops, dag iops>
: BaseMemTag<{0b1, Load}, 0b00, asm_insn, asm_opnds,
"", oops, iops> {
bits<5> Rt;
let Inst{20-12} = 0b000000000;
let Inst{4-0} = Rt;
let mayLoad = Load;
}
class MemTagLoad<string asm_insn, string asm_opnds>
: BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "$Rt = $wback",
(outs GPR64:$wback),
(ins GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)> {
bits<5> Rt;
bits<9> offset;
let Inst{20-12} = offset;
let Inst{4-0} = Rt;
let mayLoad = 1;
}
class BaseMemTagStore<bits<2> opc1, bits<2> opc2, string asm_insn,
string asm_opnds, string cstr, dag oops, dag iops>
: BaseMemTag<opc1, opc2, asm_insn, asm_opnds, cstr, oops, iops> {
bits<5> Rt;
bits<9> offset;
let Inst{20-12} = offset;
let Inst{4-0} = Rt;
let mayStore = 1;
}
multiclass MemTagStore<bits<2> opc1, string insn> {
def Offset :
BaseMemTagStore<opc1, 0b10, insn, "\t$Rt, [$Rn, $offset]", "",
(outs), (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def PreIndex :
BaseMemTagStore<opc1, 0b11, insn, "\t$Rt, [$Rn, $offset]!",
"$Rn = $wback",
(outs GPR64sp:$wback),
(ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def PostIndex :
BaseMemTagStore<opc1, 0b01, insn, "\t$Rt, [$Rn], $offset",
"$Rn = $wback",
(outs GPR64sp:$wback),
(ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def : InstAlias<insn # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "Offset") GPR64sp:$Rt, GPR64sp:$Rn, 0)>;
}
//---
// Exception generation
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
: I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
let Inst{23-21} = op1;
let Inst{20-5} = imm;
let Inst{4-2} = 0b000;
let Inst{1-0} = ll;
}
//---
// UDF : Permanently UNDEFINED instructions. Format: Opc = 0x0000, 16-bit imm.
//---
let hasSideEffects = 1, isTrap = 1, mayLoad = 0, mayStore = 0 in {
class UDFType<bits<16> opc, string asm>
: I<(outs), (ins uimm16:$imm),
asm, "\t$imm", "", []>,
Sched<[]> {
bits<16> imm;
let Inst{31-16} = opc;
let Inst{15-0} = imm;
}
}
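With opc fixed at 0x0000, the whole 32-bit word reduces to the zero-extended 16-bit immediate. A one-line sketch of the encoding implied by the bit assignments above (illustrative only):

#include <cstdint>

// UDF #imm16: opc (0x0000) in bits 31-16, immediate in bits 15-0.
uint32_t encodeUDF(uint16_t imm16) {
  return (uint32_t(0x0000) << 16) | imm16;
}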
let Predicates = [HasFPARMv8] in {
//---
// Floating point to integer conversion
//---
class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn),
asm, "\t$Rd, $Rn", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
asm, "\t$Rd, $Rn, $scale", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
let Inst{21} = 0;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = scale;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Unscaled half-precision to 32-bit
def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
[(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Predicates = [HasFullFP16];
}
// Unscaled half-precision to 64-bit
def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
[(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
}
// Unscaled single-precision to 32-bit
def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
[(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
let Inst{31} = 0; // 32-bit GPR flag
}
// Unscaled single-precision to 64-bit
def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
[(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
let Inst{31} = 1; // 64-bit GPR flag
}
// Unscaled double-precision to 32-bit
def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
[(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
}
// Unscaled double-precision to 64-bit
def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
[(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
}
}
multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Scaled half-precision to 32-bit
def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
fixedpoint_f16_i32, asm,
[(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
let Predicates = [HasFullFP16];
}
// Scaled half-precision to 64-bit
def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
fixedpoint_f16_i64, asm,
[(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
}
// Scaled single-precision to 32-bit
def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
fixedpoint_f32_i32, asm,
[(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
fixedpoint_f32_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
}
// Scaled single-precision to 64-bit
def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
fixedpoint_f32_i64, asm,
[(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
fixedpoint_f32_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
}
// Scaled double-precision to 32-bit
def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
fixedpoint_f64_i32, asm,
[(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
fixedpoint_f64_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
}
// Scaled double-precision to 64-bit
def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
fixedpoint_f64_i64, asm,
[(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
fixedpoint_f64_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
}
}
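The fixedpoint_* operand in these patterns stands for 2^fbits, so the selected dag is literally "multiply by 2^fbits, then convert". A sketch of the value computed, rounding toward zero and ignoring the saturation the real instruction performs on overflow (illustrative only, not part of this file):

#include <cmath>
#include <cstdint>

// FCVTZS-style scaled conversion; NaN handling and saturation omitted.
int64_t fpToFixed(double x, unsigned fbits) {
  return (int64_t)std::trunc(x * std::ldexp(1.0, (int)fbits));
}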
//---
// Integer to floating point conversion
//---
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseIntegerToFP<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
asm, "\t$Rd, $Rn, $scale", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b00001;
let Inst{16} = isUnsigned;
let Inst{15-10} = scale;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class BaseIntegerToFPUnscaled<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
ValueType dvt, string asm, SDNode node>
: I<(outs dstType:$Rd), (ins srcType:$Rn),
asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b10001;
let Inst{16} = isUnsigned;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Unscaled
def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
}
def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
}
// Scaled
def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
[(set (f16 FPR16:$Rd),
(fdiv (node GPR32:$Rn),
fixedpoint_f16_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let scale{5} = 1;
let Predicates = [HasFullFP16];
}
def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
[(set FPR32:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f32_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
let scale{5} = 1;
}
def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
[(set FPR64:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f64_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
let scale{5} = 1;
}
def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
[(set (f16 FPR16:$Rd),
(fdiv (node GPR64:$Rn),
fixedpoint_f16_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
[(set FPR32:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f32_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
[(set FPR64:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f64_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
}
}
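The scaled integer-to-FP forms are the inverse operation: the patterns divide the converted integer by 2^fbits, matching the (fdiv (node $Rn), $scale) dags above. Roughly, as a sketch only:

#include <cmath>

// SCVTF/UCVTF with a fractional-bits operand: convert, then divide by
// 2^fbits (a single correctly rounded step in hardware).
double fixedToFp(long long n, unsigned fbits) {
  return (double)n / std::ldexp(1.0, (int)fbits);
}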
//---
// Unscaled integer <-> floating point conversion (i.e. FMOV)
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
string asm>
: I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
// We use COPY_TO_REGCLASS for these bitconvert operations.
// copyPhysReg() expands the resultant COPY instructions after
// regalloc is done. This gives greater freedom for the allocator
// and related passes (coalescing, copy propagation, et al.) to
// be more effective.
[/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-24} = 0b0011110;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterOperand dstType, string asm,
string kind>
: I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
"{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-23} = 0b00111101;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeFMOVLaneInstruction";
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
RegisterOperand srcType, RegisterClass dstType, string asm,
string kind>
: I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
"{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-23} = 0b00111101;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeFMOVLaneInstruction";
}
multiclass UnscaledConversion<string asm> {
def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
}
def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
}
def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
}
def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
asm, ".d"> {
let Inst{31} = 1;
let Inst{22} = 0;
}
def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
asm, ".d"> {
let Inst{31} = 1;
let Inst{22} = 0;
}
}
//---
// Floating point conversion
//---
class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
RegisterClass srcType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{23-22} = type;
let Inst{21-17} = 0b10001;
let Inst{16-15} = opcode;
let Inst{14-10} = 0b10000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass FPConversion<string asm> {
// Double-precision to Half-precision
def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
[(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
// Double-precision to Single-precision
def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
[(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>;
// Half-precision to Double-precision
def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
[(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Half-precision to Single-precision
def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
[(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Single-precision to Double-precision
def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
[(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
// Single-precision to Half-precision
def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
[(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
}
//---
// Single operand floating point data processing
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSingleOperandFPData<bits<6> opcode, RegisterClass regtype,
ValueType vt, string asm, SDPatternOperator node>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
[(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{21} = 0b1;
let Inst{20-15} = opcode;
let Inst{14-10} = 0b10000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SingleOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
def Sr : BaseSingleOperandFPData<{0b00,opcode}, FPR32, f32, asm, node> {
let Inst{23-22} = 0b00; // 32-bit size flag
}
def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> {
let Inst{23-22} = 0b01; // 64-bit size flag
}
}
multiclass SingleOperandFPNo16<bits<6> opcode, string asm,
SDPatternOperator node = null_frag> {
def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
let Inst{23-22} = 0b00; // 32-bit registers
}
def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
let Inst{23-22} = 0b01; // 64-bit registers
}
}
// FRInt[32|64][Z|N] instructions
multiclass FRIntNNT<bits<2> opcode, string asm, SDPatternOperator node = null_frag> :
SingleOperandFPNo16<{0b0100,opcode}, asm, node>;
//---
// Two operand floating point data processing
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pat>,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass TwoOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
[(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set (f32 FPR32:$Rd),
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set (f64 FPR64:$Rd),
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
}
}
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
[(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
}
}
//---
// Three operand floating point data processing
//---
class BaseThreeOperandFPData<bit isNegated, bit isSub,
RegisterClass regtype, string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
Sched<[WriteFMul]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
let Inst{31-24} = 0b00011111;
let Inst{21} = isNegated;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
let Inst{14-10} = Ra;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass ThreeOperandFPData<bit isNegated, bit isSub, string asm,
SDPatternOperator node> {
def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
[(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
[(set FPR32:$Rd,
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
[(set FPR64:$Rd,
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
}
}
//---
// Floating point data comparisons
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseOneOperandFPComparison<bit signalAllNans,
RegisterClass regtype, string asm,
list<dag> pat>
: I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
Sched<[WriteFCmp]> {
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Rn;
let Inst{4} = signalAllNans;
let Inst{3-0} = 0b1000;
// Rm should be 0b00000 canonically, but we need to accept any value.
let PostEncoderMethod = "fixOneOperandFPComparison";
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
string asm, list<dag> pat>
: I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
Sched<[WriteFCmp]> {
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Rn;
let Inst{4} = signalAllNans;
let Inst{3-0} = 0b0000;
}
multiclass FPComparison<bit signalAllNans, string asm,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV] in {
def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
[(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
[(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
let Inst{23-22} = 0b00;
}
def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
let Inst{23-22} = 0b00;
}
def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
let Inst{23-22} = 0b01;
}
def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
let Inst{23-22} = 0b01;
}
} // Defs = [NZCV]
}
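These comparisons produce only NZCV (note Defs = [NZCV] and the empty outs). A sketch of the flag values an AArch64 FP compare yields, assuming the architectural FPCompare result encoding; the signalAllNans bit (FCMP vs FCMPE) only changes which NaN inputs raise Invalid Operation, not the flags sketched here. Illustrative only, not part of this file:

#include <cmath>
#include <cstdint>

// NZCV packed as a 4-bit value N:Z:C:V.
uint8_t fpCompareFlags(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) return 0b0011; // unordered
  if (a == b)                         return 0b0110; // equal
  if (a < b)                          return 0b1000; // less than
  return 0b0010;                                     // greater than
}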
//---
// Floating point conditional comparisons
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
string mnemonic, list<dag> pat>
: I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
Sched<[WriteFCmp]> {
let Uses = [NZCV];
let Defs = [NZCV];
bits<5> Rn;
bits<5> Rm;
bits<4> nzcv;
bits<4> cond;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = 0b01;
let Inst{9-5} = Rn;
let Inst{4} = signalAllNans;
let Inst{3-0} = nzcv;
}
multiclass FPCondComparison<bit signalAllNans, string mnemonic,
SDPatternOperator OpNode = null_frag> {
def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic,
[(set NZCV, (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm), (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
[(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]> {
let Inst{23-22} = 0b00;
}
def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
[(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]> {
let Inst{23-22} = 0b01;
}
}
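The conditional compare adds one twist: if the condition derived from the current NZCV holds, the flags come from the comparison of $Rn and $Rm; otherwise the literal #nzcv operand is installed verbatim. A sketch of that selection step, taking the ordinary compare result as an input (illustrative only):

#include <cstdint>

// cmpFlags is what the ordinary compare of Rn and Rm would produce;
// nzcvImm is the 4-bit immediate operand.
uint8_t fccmpFlags(bool condHolds, uint8_t cmpFlags, uint8_t nzcvImm) {
  return condHolds ? cmpFlags : nzcvImm;
}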
//---
// Floating point conditional select
//---
class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
asm, "\t$Rd, $Rn, $Rm, $cond", "",
[(set regtype:$Rd,
(AArch64csel (vt regtype:$Rn), regtype:$Rm,
(i32 imm:$cond), NZCV))]>,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> cond;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = 0b11;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass FPCondSelect<string asm> {
let Uses = [NZCV] in {
def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
let Inst{23-22} = 0b00;
}
def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
let Inst{23-22} = 0b01;
}
} // Uses = [NZCV]
}
//---
// Floating move immediate
//---
class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
: I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
[(set regtype:$Rd, fpimmtype:$imm)]>,
Sched<[WriteFImm]> {
bits<5> Rd;
bits<8> imm;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-13} = imm;
let Inst{12-5} = 0b10000000;
let Inst{4-0} = Rd;
}
multiclass FPMoveImmediate<string asm> {
def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
let Inst{23-22} = 0b00;
}
def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
let Inst{23-22} = 0b01;
}
}
} // end of 'let Predicates = [HasFPARMv8]'
//----------------------------------------------------------------------------
// AdvSIMD
//----------------------------------------------------------------------------
let Predicates = [HasNEON] in {
//----------------------------------------------------------------------------
// AdvSIMD three register vector instructions
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
: Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
Sched<[WriteV]>;
multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorPseudo<V64,
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDThreeSameVectorPseudo<V128,
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]>;
def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
(v4i16 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
(v2i32 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
(v1i64 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
(v8i16 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
(v4i32 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
(v2i64 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
}
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
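Reading the instantiations above against BaseSIMDThreeSameVector: the low bit of the 3-bit size field is 1 in every def here, so bits 23-22 carry the element size while bit 30 (Q) selects a 64- or 128-bit vector. A small decoder-style sketch of that mapping, as an illustration only:

// Element width and lane count implied by Q (Inst{30}) and Inst{23-22}.
struct Arrangement { unsigned elemBits; unsigned lanes; };

Arrangement threeSameArrangement(bool Q, unsigned size2 /*Inst{23-22}*/) {
  unsigned elemBits = 8u << size2;        // 00->8, 01->16, 10->32, 11->64
  unsigned vecBits  = Q ? 128u : 64u;
  return {elemBits, vecBits / elemBits};  // e.g. Q=1, size2=00 -> .16b
}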
multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
}
// As above, but D sized elements unsupported.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
// As above, but only B sized elements supported.
multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd),
(OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
}
// As above, but only floating point elements supported.
multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
asm, ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
asm, ".4h",
[(set (v4f16 V64:$dst),
(OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
asm, ".8h",
[(set (v8f16 V128:$dst),
(OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$dst),
(OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$dst),
(OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$dst),
(OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
// As above, but D and B sized elements unsupported.
multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
// Logical three vector ops share opcode bits, and only use B sized elements.
multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
(!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
(!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
}
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
string asm, SDPatternOperator OpNode = null_frag> {
def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]>;
def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
(v4i16 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
(v2i32 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
(v1i64 V64:$RHS))),
(!cast<Instruction>(NAME#"v8i8")
V64:$LHS, V64:$MHS, V64:$RHS)>;
def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
(v8i16 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
(v4i32 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
(v2i64 V128:$RHS))),
(!cast<Instruction>(NAME#"v16i8")
V128:$LHS, V128:$MHS, V128:$RHS)>;
}
// ARMv8.2-A Dot Product Instructions (Vector): These instructions treat each
// S-sized element of the sources as four byte lanes, multiply corresponding
// bytes, and accumulate the four products into the destination element.
class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
[(set (AccumType RegType:$dst),
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
v2i32, v8i8, OpNode>;
def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
v4i32, v16i8, OpNode>;
}
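Per 32-bit destination lane, the dot-product forms accumulate the four products of corresponding bytes from the two sources. A scalar sketch of one signed (SDOT-style) lane, illustrative only and not part of this file:

#include <cstdint>

// One 32-bit lane: each source lane is treated as four signed bytes
// whose pairwise products are added into the accumulator.
int32_t sdotLane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
  for (int i = 0; i < 4; ++i)
    acc += int32_t(a[i]) * int32_t(b[i]);
  return acc;
}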
// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from
// 8H to 4S, when Q=1).
class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
[(set (AccumType RegType:$dst),
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
let Inst{13} = b13;
}
multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
SDPatternOperator OpNode> {
def v4f16 : BaseSIMDThreeSameVectorFML<0, U, b13, size, asm, ".2s", ".2h", V64,
v2f32, v4f16, OpNode>;
def v8f16 : BaseSIMDThreeSameVectorFML<1, U, b13, size, asm, ".4s", ".4h", V128,
v4f32, v8f16, OpNode>;
}
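Each destination lane of these widening multiply-adds is a single-precision accumulation of a half-precision product: two lanes for the 64-bit form, four for the 128-bit form. A sketch with the fp16 inputs modelled as already-widened floats (an assumption, since standard C++ has no portable fp16 type); illustrative only:

// 'lanes' is 2 for the .2s form and 4 for the .4s form; n16/m16 hold
// fp16 values already widened to float for the purposes of this sketch.
void fmlalSketch(float *acc, const float *n16, const float *m16, int lanes) {
  for (int i = 0; i < lanes; ++i)
    acc[i] += n16[i] * m16[i];  // fused (single-rounding) in hardware
}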
//----------------------------------------------------------------------------
// AdvSIMD two register vector instructions.
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
bits<2> size2, RegisterOperand regtype, string asm,
string dstkind, string srckind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
bits<2> size2, RegisterOperand regtype,
string asm, string dstkind, string srckind,
list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// Supports B, H, and S element sizes.
multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
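// Note on naming: each def inside these multiclasses is prefixed with the
// defm name at instantiation, so the records become NAME # "v8i8",
// NAME # "v16i8", and so on. That is why patterns and aliases elsewhere in
// this file refer back to them with !cast<Instruction>(NAME # "...").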
class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
RegisterOperand regtype, string asm, string dstkind,
string srckind, string amount>
: I<(outs V128:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
"|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-24} = 0b101110;
let Inst{23-22} = size;
let Inst{21-10} = 0b100001001110;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDVectorLShiftLongBySizeBHS {
let hasSideEffects = 0 in {
def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
"shll", ".8h", ".8b", "8">;
def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
"shll2", ".8h", ".16b", "8">;
def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
"shll", ".4s", ".4h", "16">;
def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
"shll2", ".4s", ".8h", "16">;
def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
"shll", ".2d", ".2s", "32">;
def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
"shll2", ".2d", ".4s", "32">;
}
}
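// For reference, the defs above correspond to the assembly forms
//   shll  <Vd>.8h, <Vn>.8b,  #8       shll2 <Vd>.8h, <Vn>.16b, #8
//   shll  <Vd>.4s, <Vn>.4h,  #16      shll2 <Vd>.4s, <Vn>.8h,  #16
//   shll  <Vd>.2d, <Vn>.2s,  #32      shll2 <Vd>.2d, <Vn>.4s,  #32
// i.e. the shift amount is always the source element width.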
// Supports all element sizes.
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
(v8i8 V64:$Rn)))]>;
def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
(v16i8 V128:$Rn)))]>;
def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
(v4i16 V64:$Rn)))]>;
def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v8i16 V128:$Rn)))]>;
def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
(v2i32 V64:$Rn)))]>;
def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
(v4i32 V128:$Rn)))]>;
}
// Supports all element sizes, except 1xD.
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
// Supports only B element sizes.
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
}
// Supports only B and H element sizes.
multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
}
// Supports H, S and D element sizes and uses the high bit of the size field
// as an extra opcode bit.
multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
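// Encoding note for SIMDTwoVectorFP: with S as the multiclass parameter, the
// defs above pass size={S,0} for .2s/.4s, size={S,1} for .2d, and size={S,1}
// with size2=0b11 for the FP16 .4h/.8h forms. Inst{23} therefore always
// carries S (the extra opcode bit mentioned above), while Inst{22} and
// Inst{20-19} select the element size.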
// Supports only S and D element sizes.
multiclass SIMDTwoVectorSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v2f32 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
def v4f32 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
def v2f64 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
multiclass FRIntNNTVector<bit U, bit op, string asm,
SDPatternOperator OpNode = null_frag> :
SIMDTwoVectorSD<U, {0b1111,op}, asm, OpNode>;
// Supports only S element size.
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
list<dag> pattern>
: I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
list<dag> pattern>
: I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
asm, ".8b", ".8h",
[(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
asm#"2", ".16b", ".8h", []>;
def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
asm, ".4h", ".4s",
[(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
asm#"2", ".8h", ".4s", []>;
def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
asm, ".2s", ".2d",
[(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
(!cast<Instruction>(NAME # "v16i8")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
(!cast<Instruction>(NAME # "v8i16")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
(!cast<Instruction>(NAME # "v4i32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
}
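// The trailing Pat<> defs above cover the "<mnemonic>2" narrowing-to-high
// forms: the tied .16b/.8h/.4s defs carry no ISel pattern of their own, so a
// concat_vectors of an existing low half with a freshly narrowed value is
// matched here, the low half is placed into the full register with
// INSERT_SUBREG, and the "2" instruction then writes the high half.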
class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
bits<5> opcode, RegisterOperand regtype, string asm,
string kind, string zero, ValueType dty,
ValueType sty, SDNode OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
"|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
[(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// Comparisons support all element sizes, except 1xD.
multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
SDNode OpNode> {
def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
asm, ".8b", "0",
v8i8, v8i8, OpNode>;
def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
asm, ".16b", "0",
v16i8, v16i8, OpNode>;
def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
asm, ".4h", "0",
v4i16, v4i16, OpNode>;
def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
asm, ".8h", "0",
v8i16, v8i16, OpNode>;
def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
asm, ".2s", "0",
v2i32, v2i32, OpNode>;
def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
asm, ".4s", "0",
v4i32, v4i32, OpNode>;
def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
asm, ".2d", "0",
v2i64, v2i64, OpNode>;
}
// FP Comparisons support only S and D element sizes (and H for v8.2a).
multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
asm, ".4h", "0.0",
v4i16, v4f16, OpNode>;
def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
asm, ".8h", "0.0",
v8i16, v8f16, OpNode>;
} // Predicates = [HasNEON, HasFullFP16]
def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
asm, ".2s", "0.0",
v2i32, v2f32, OpNode>;
def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
asm, ".4s", "0.0",
v4i32, v4f32, OpNode>;
def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
asm, ".2d", "0.0",
v2i64, v2f64, OpNode>;
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
(!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
}
def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
}
def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
list<dag> pattern>
: I<(outs outtype:$Rd), (ins intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
list<dag> pattern>
: I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
asm, ".4s", ".4h", []>;
def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
asm#"2", ".4s", ".8h", []>;
def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
asm, ".2d", ".2s", []>;
def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
asm#"2", ".2d", ".4s", []>;
}
multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
asm, ".4h", ".4s", []>;
def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
asm#"2", ".8h", ".4s", []>;
def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
asm, ".2s", ".2d", []>;
def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
}
multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
Intrinsic OpNode> {
def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
asm, ".2s", ".2d",
[(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
(!cast<Instruction>(NAME # "v4f32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
}
//----------------------------------------------------------------------------
// AdvSIMD three register different-size vector instructions.
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
RegisterOperand outtype, RegisterOperand intype1,
RegisterOperand intype2, string asm,
string outkind, string inkind1, string inkind2,
list<dag> pattern>
: I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size{2-1};
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
RegisterOperand outtype, RegisterOperand intype1,
RegisterOperand intype2, string asm,
string outkind, string inkind1, string inkind2,
list<dag> pattern>
: I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size{2-1};
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// FIXME: TableGen doesn't know how to deal with expanded types that also
// change the element count (in this case, placing the results in
// the high elements of the result register rather than the low
// elements). Until that's fixed, we can't code-gen those.
multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
Intrinsic IntOp> {
def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V64, V128, V128,
asm, ".8b", ".8h", ".8h",
[(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".16b", ".8h", ".8h",
[]>;
def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V64, V128, V128,
asm, ".4h", ".4s", ".4s",
[(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".8h", ".4s", ".4s",
[]>;
def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V64, V128, V128,
asm, ".2s", ".2d", ".2d",
[(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".4s", ".2d", ".2d",
[]>;
// Patterns for the '2' variants involve INSERT_SUBREG, which can't be used in
// a pattern attached directly to an instruction definition.
def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
(v8i16 V128:$Rm))),
(!cast<Instruction>(NAME # "v8i16_v16i8")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
(v4i32 V128:$Rm))),
(!cast<Instruction>(NAME # "v4i32_v8i16")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
(v2i64 V128:$Rm))),
(!cast<Instruction>(NAME # "v2i64_v4i32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
}
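// With a mnemonic <op>, the defs above produce the narrowing forms
//   <op>  <Vd>.8b, <Vn>.8h, <Vm>.8h     <op>2 <Vd>.16b, <Vn>.8h, <Vm>.8h
//   <op>  <Vd>.4h, <Vn>.4s, <Vm>.4s     <op>2 <Vd>.8h,  <Vn>.4s, <Vm>.4s
//   <op>  <Vd>.2s, <Vn>.2d, <Vm>.2d     <op>2 <Vd>.4s,  <Vn>.2d, <Vm>.2d
// where the "2" variants write only the high half of Vd, hence the tied base
// class and the concat_vectors patterns above.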
multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
Intrinsic IntOp> {
def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b", []>;
let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
asm, ".1q", ".1d", ".1d", []>;
def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc,
V128, V128, V128,
asm#"2", ".1q", ".2d", ".2d", []>;
}
def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
(v8i8 (extract_high_v16i8 V128:$Rm)))),
(!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
}
multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
}
multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))))]>;
}
multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$dst),
(add (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(add (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm))))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(add (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(add (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm))))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(add (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(add (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm))))))]>;
}
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
}
multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd),
(extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
(extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
(extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
}
multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
(v4i16 V64:$Rm)))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
(v2i32 V64:$Rm)))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))))]>;
}
multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V128, V64,
asm, ".8h", ".8h", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V128, V64,
asm, ".4s", ".4s", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V128, V64,
asm, ".2d", ".2d", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
}
//----------------------------------------------------------------------------
// AdvSIMD bitwise extract from vector
//----------------------------------------------------------------------------
class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
string asm, string kind>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
"|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
[(set (vty regtype:$Rd),
(AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> imm;
let Inst{31} = 0;
let Inst{30} = size;
let Inst{29-21} = 0b101110000;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-11} = imm;
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDBitwiseExtract<string asm> {
def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
let imm{3} = 0;
}
def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
}
//----------------------------------------------------------------------------
// AdvSIMD zip vector
//----------------------------------------------------------------------------
class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
string asm, string kind, SDNode OpNode, ValueType valty>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm}", "",
[(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29-24} = 0b001110;
let Inst{23-22} = size{2-1};
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDZipVector<bits<3> opc, string asm,
SDNode OpNode> {
def v8i8 : BaseSIMDZipVector<0b000, opc, V64,
asm, ".8b", OpNode, v8i8>;
def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
asm, ".16b", OpNode, v16i8>;
def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
asm, ".4h", OpNode, v4i16>;
def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
asm, ".8h", OpNode, v8i16>;
def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
asm, ".2s", OpNode, v2i32>;
def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
asm, ".4s", OpNode, v4i32>;
def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
asm, ".2d", OpNode, v2i64>;
def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
}
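// The trailing Pat<> defs reuse the integer-typed permute instructions for
// the equally sized floating-point vector types; only lane positions change,
// so the same encodings apply and no separate FP defs are needed.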
//----------------------------------------------------------------------------
// AdvSIMD three register scalar instructions
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
dag oops, dag iops, string asm,
list<dag> pattern>
: I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = R;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
}
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
(!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
}
multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
}
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
(ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
(ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, []>;
}
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
}
multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[]>;
} // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
}
class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
dag oops, dag iops, string asm, string cstr, list<dag> pat>
: I<oops, iops, asm,
"\t$Rd, $Rn, $Rm", cstr, pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$Rd),
(ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$Rd),
(ins FPR32:$Rn, FPR32:$Rm), asm, "",
[(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$dst),
(ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, "$Rd = $dst", []>;
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$dst),
(ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, "$Rd = $dst",
[(set (i64 FPR64:$dst),
(OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
}
//----------------------------------------------------------------------------
// AdvSIMD two register scalar instructions
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
"\t$Rd, $Rn", "", pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
"\t$Rd, $Rn", "$Rd = $dst", pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, string asm, string zero>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"\t$Rd, $Rn, #" # zero, "", []>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
[(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-17} = 0b011111100110000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
def : Pat<(v1i64 (OpNode FPR64:$Rn)),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
}
multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
}
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
}
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
}
multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
}
multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
}
}
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
}
multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>;
def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
}
multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
}
//----------------------------------------------------------------------------
// AdvSIMD scalar pairwise instructions
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, RegisterOperand vectype,
string asm, string kind>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21-17} = 0b11000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
asm, ".2d">;
}
multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
asm, ".2h">;
}
def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
asm, ".2s">;
def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
asm, ".2d">;
}
//----------------------------------------------------------------------------
// AdvSIMD across lanes instructions
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterClass regtype, RegisterOperand vectype,
string asm, string kind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b11000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
string asm> {
def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64,
asm, ".8b", []>;
def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128,
asm, ".16b", []>;
def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
asm, ".4h", []>;
def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
asm, ".8h", []>;
def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
asm, ".4s", []>;
}
multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
asm, ".8b", []>;
def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
asm, ".16b", []>;
def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
asm, ".4h", []>;
def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
asm, ".8h", []>;
def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
asm, ".4s", []>;
}
multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
Intrinsic intOp> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
asm, ".4h",
[(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
asm, ".8h",
[(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
[(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
}
//----------------------------------------------------------------------------
// AdvSIMD INS/DUP instructions
//----------------------------------------------------------------------------
// FIXME: There has got to be a better way to factor these. ugh.
class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
string operands, string constraints, list<dag> pattern>
: I<outs, ins, asm, operands, constraints, pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = op;
let Inst{28-21} = 0b01110000;
let Inst{15} = 0;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
RegisterOperand vecreg, RegisterClass regtype>
: BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
"{\t$Rd" # size # ", $Rn" #
"|" # size # "\t$Rd, $Rn}", "",
[(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
let Inst{20-16} = imm5;
let Inst{14-11} = 0b0001;
}
class SIMDDupFromElement<bit Q, string dstkind, string srckind,
ValueType vectype, ValueType insreg,
RegisterOperand vecreg, Operand idxtype,
ValueType elttype, SDNode OpNode>
: BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
"{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
"|" # dstkind # "\t$Rd, $Rn$idx}", "",
[(set (vectype vecreg:$Rd),
(OpNode (insreg V128:$Rn), idxtype:$idx))]> {
let Inst{14-11} = 0b0000;
}
class SIMDDup64FromElement
: SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
VectorIndexD, i64, AArch64duplane64> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
}
class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
VectorIndexS, i64, AArch64duplane32> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
VectorIndexH, i64, AArch64duplane16> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
VectorIndexB, i64, AArch64duplane8> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
Operand idxtype, string asm, list<dag> pattern>
: BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
"{\t$Rd, $Rn" # size # "$idx" #
"|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
let Inst{14-11} = imm4;
}
class SIMDSMov<bit Q, string size, RegisterClass regtype,
Operand idxtype>
: BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
Operand idxtype>
: BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
[(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
class SIMDMovAlias<string asm, string size, Instruction inst,
RegisterClass regtype, Operand idxtype>
: InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
"|" # size # "\t$dst, $src$idx}",
(inst regtype:$dst, V128:$src, idxtype:$idx)>;
multiclass SMov {
def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
}
multiclass UMov {
def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
}
def : SIMDMovAlias<"mov", ".s",
!cast<Instruction>(NAME#"vi32"),
GPR32, VectorIndexS>;
def : SIMDMovAlias<"mov", ".d",
!cast<Instruction>(NAME#"vi64"),
GPR64, VectorIndexD>;
}
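// The aliases above allow the "mov" spelling for the 32- and 64-bit UMOV
// forms, e.g.
//   umov w0, v1.s[2]    may also be written    mov w0, v1.s[2]
//   umov x0, v1.d[1]    may also be written    mov x0, v1.d[1]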
class SIMDInsFromMain<string size, ValueType vectype,
RegisterClass regtype, Operand idxtype>
: BaseSIMDInsDup<1, 0, (outs V128:$dst),
(ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
"{\t$Rd" # size # "$idx, $Rn" #
"|" # size # "\t$Rd$idx, $Rn}",
"$Rd = $dst",
[(set V128:$dst,
(vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
let Inst{14-11} = 0b0011;
}
class SIMDInsFromElement<string size, ValueType vectype,
ValueType elttype, Operand idxtype>
: BaseSIMDInsDup<1, 1, (outs V128:$dst),
(ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
"{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
"|" # size # "\t$Rd$idx, $Rn$idx2}",
"$Rd = $dst",
[(set V128:$dst,
(vector_insert
(vectype V128:$Rd),
(elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
idxtype:$idx))]>;
class SIMDInsMainMovAlias<string size, Instruction inst,
RegisterClass regtype, Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
"|" # size #"\t$dst$idx, $src}",
(inst V128:$dst, idxtype:$idx, regtype:$src)>;
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2"
# "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
multiclass SIMDIns {
def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
}
def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
bits<4> idx;
bits<4> idx2;
let Inst{20-17} = idx;
let Inst{16} = 1;
let Inst{14-11} = idx2;
}
def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
bits<3> idx;
bits<3> idx2;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
let Inst{14-12} = idx2;
let Inst{11} = {?};
}
def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
bits<2> idx;
bits<2> idx2;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
let Inst{14-13} = idx2;
let Inst{12-11} = {?,?};
}
def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
bits<1> idx;
bits<1> idx2;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
let Inst{14} = idx2;
let Inst{13-11} = {?,?,?};
}
// For all forms of the INS instruction, the "mov" mnemonic is the
// preferred alias. Why they didn't just call the instruction "mov" in
// the first place is a very good question indeed...
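// For illustration, the aliases defined below let the assembler accept, and
// the disassembler print, forms such as:
//   mov v0.s[1], w5        (INS from a general-purpose register)
//   mov v0.b[3], v1.b[7]   (INS from another vector element)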
def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
GPR32, VectorIndexB>;
def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
GPR32, VectorIndexH>;
def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
GPR32, VectorIndexS>;
def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
GPR64, VectorIndexD>;
def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
VectorIndexB>;
def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
VectorIndexH>;
def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
VectorIndexS>;
def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
VectorIndexD>;
}
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-21} = 0b001110000;
let Inst{20-16} = Vm;
let Inst{15} = 0;
let Inst{14-13} = len;
let Inst{12} = op;
let Inst{11-10} = 0b00;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-21} = 0b001110000;
let Inst{20-16} = Vm;
let Inst{15} = 0;
let Inst{14-13} = len;
let Inst{12} = op;
let Inst{11-10} = 0b00;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
}
class SIMDTableLookupAlias<string asm, Instruction inst,
RegisterOperand vectype, RegisterOperand listtype>
: InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
(inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
multiclass SIMDTableLookup<bit op, string asm> {
def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
asm, ".8b">;
def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
asm, ".8b">;
def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
asm, ".8b">;
def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
asm, ".8b">;
def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
asm, ".16b">;
def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
asm, ".16b">;
def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
asm, ".16b">;
def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
asm, ".16b">;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8One"),
V64, VecListOne128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Two"),
V64, VecListTwo128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Three"),
V64, VecListThree128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Four"),
V64, VecListFour128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8One"),
V128, VecListOne128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Two"),
V128, VecListTwo128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Three"),
V128, VecListThree128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Four"),
V128, VecListFour128>;
}
multiclass SIMDTableLookupTied<bit op, string asm> {
def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
asm, ".8b">;
def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
asm, ".8b">;
def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
asm, ".8b">;
def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
asm, ".8b">;
def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
asm, ".16b">;
def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
asm, ".16b">;
def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
asm, ".16b">;
def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
asm, ".16b">;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8One"),
V64, VecListOne128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Two"),
V64, VecListTwo128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Three"),
V64, VecListThree128>;
def : SIMDTableLookupAlias<asm # ".8b",
!cast<Instruction>(NAME#"v8i8Four"),
V64, VecListFour128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8One"),
V128, VecListOne128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Two"),
V128, VecListTwo128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Three"),
V128, VecListThree128>;
def : SIMDTableLookupAlias<asm # ".16b",
!cast<Instruction>(NAME#"v16i8Four"),
V128, VecListFour128>;
}
//----------------------------------------------------------------------------
// AdvSIMD scalar CPY
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
string kind, Operand idxtype>
: I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
"{\t$dst, $src" # kind # "$idx" #
"|\t$dst, $src$idx}", "", []>,
Sched<[WriteV]> {
bits<5> dst;
bits<5> src;
let Inst{31-21} = 0b01011110000;
let Inst{15-10} = 0b000001;
let Inst{9-5} = src;
let Inst{4-0} = dst;
}
class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
: InstAlias<asm # "{\t$dst, $src" # size # "$index"
# "|\t$dst, $src$index}",
(inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
multiclass SIMDScalarCPY<string asm> {
def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
}
def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
VectorIndexD:$idx)))),
(!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
// 'DUP' mnemonic aliases.
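// These aliases are parse-only (the trailing 0 in SIMDScalarCPYAlias keeps
// them out of the printer), so for example "dup s0, v1.s[2]" assembles to the
// same encoding that is printed back as "mov s0, v1.s[2]".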
def : SIMDScalarCPYAlias<"dup", ".b",
!cast<Instruction>(NAME#"i8"),
FPR8, V128, VectorIndexB>;
def : SIMDScalarCPYAlias<"dup", ".h",
!cast<Instruction>(NAME#"i16"),
FPR16, V128, VectorIndexH>;
def : SIMDScalarCPYAlias<"dup", ".s",
!cast<Instruction>(NAME#"i32"),
FPR32, V128, VectorIndexS>;
def : SIMDScalarCPYAlias<"dup", ".d",
!cast<Instruction>(NAME#"i64"),
FPR64, V128, VectorIndexD>;
}
//----------------------------------------------------------------------------
// AdvSIMD modified immediate instructions
//----------------------------------------------------------------------------
class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
string asm, string op_string,
string cstr, list<dag> pattern>
: I<oops, iops, asm, op_string, cstr, pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<8> imm8;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = op;
let Inst{28-19} = 0b0111100000;
let Inst{18-16} = imm8{7-5};
let Inst{11} = op2;
let Inst{10} = 1;
let Inst{9-5} = imm8{4-0};
let Inst{4-0} = Rd;
}
class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
!con((ins immtype:$imm8), opt_shift_iop), asm,
"{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
"", pattern> {
let DecoderMethod = "DecodeModImmInstruction";
}
class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
!con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
"$Rd = $dst", pattern> {
let DecoderMethod = "DecodeModImmTiedInstruction";
}
class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14-13} = shift;
let Inst{12} = b15_b12{0};
}
class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14-13} = shift;
let Inst{12} = b15_b12{0};
}
class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14} = 0;
let Inst{13} = shift{0};
let Inst{12} = b15_b12{0};
}
class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14} = 0;
let Inst{13} = shift{0};
let Inst{12} = b15_b12{0};
}
multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
string asm> {
def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
asm, ".4h", []>;
def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
asm, ".8h", []>;
def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
asm, ".2s", []>;
def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
asm, ".4s", []>;
}
multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
bits<2> w_cmode, string asm,
SDNode OpNode> {
def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
asm, ".4h",
[(set (v4i16 V64:$dst), (OpNode V64:$Rd,
imm0_255:$imm8,
(i32 imm:$shift)))]>;
def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
asm, ".8h",
[(set (v8i16 V128:$dst), (OpNode V128:$Rd,
imm0_255:$imm8,
(i32 imm:$shift)))]>;
def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
asm, ".2s",
[(set (v2i32 V64:$dst), (OpNode V64:$Rd,
imm0_255:$imm8,
(i32 imm:$shift)))]>;
def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
asm, ".4s",
[(set (v4i32 V128:$dst), (OpNode V128:$Rd,
imm0_255:$imm8,
(i32 imm:$shift)))]>;
}
class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins move_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<1> shift;
let Inst{15-13} = cmode{3-1};
let Inst{12} = shift;
}
class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
RegisterOperand vectype,
Operand imm_type, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
asm, kind, pattern> {
let Inst{15-12} = cmode;
}
class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
"\t$Rd, $imm8", "", pattern> {
let Inst{15-12} = cmode;
let DecoderMethod = "DecodeModImmInstruction";
}
//----------------------------------------------------------------------------
// AdvSIMD indexed element
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
RegisterOperand dst_reg, RegisterOperand lhs_reg,
RegisterOperand rhs_reg, Operand vec_idx, string asm,
string apple_kind, string dst_kind, string lhs_kind,
string rhs_kind, list<dag> pattern>
: I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
"|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28} = Scalar;
let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
// Bit 21 must be set by the derived class.
let Inst{20-16} = Rm;
let Inst{15-12} = opc;
// Bit 11 must be set by the derived class.
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
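// Derived definitions supply the lane index and route it into bits 21 and 11
// (and 20 where needed); e.g. an .s-indexed form typically does:
//   bits<2> idx;  let Inst{11} = idx{1};  let Inst{21} = idx{0};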
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
RegisterOperand dst_reg, RegisterOperand lhs_reg,
RegisterOperand rhs_reg, Operand vec_idx, string asm,
string apple_kind, string dst_kind, string lhs_kind,
string rhs_kind, list<dag> pattern>
: I<(outs dst_reg:$dst),
(ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
"|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28} = Scalar;
let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
// Bit 21 must be set by the derived class.
let Inst{20-16} = Rm;
let Inst{15-12} = opc;
// Bit 11 must be set by the derived class.
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
//----------------------------------------------------------------------------
// Armv8.6 BFloat16 Extension
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {
class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType>
: BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
(int_aarch64_neon_bfdot (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm,
"{\t$Rd" # kind1 # ", $Rn" # kind2 #
", $Rm" # kind2 # "}");
}
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
v2f32, v8i8>;
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
v4f32, v16i8>;
}
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
string dst_kind, string lhs_kind,
string rhs_kind,
RegisterOperand RegType,
ValueType AccumType,
ValueType InputType>
: BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
RegType, RegType, V128, VectorIndexS,
asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (int_aarch64_neon_bfdot
(AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4f32 V128:$Rm),
VectorIndexH:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H
}
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
".2h", V64, v2f32, v8i8>;
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
".2h", V128, v4f32, v16i8>;
}
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
}
class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
: I<(outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
[(set (v4f32 V128:$dst),
(v4f32 (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 (bitconvert (v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
VectorIndexH:$idx)))))))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<4> Rm;
bits<3> idx;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-22} = 0b00111111;
let Inst{21-20} = idx{1-0};
let Inst{19-16} = Rm;
let Inst{15-12} = 0b1111;
let Inst{11} = idx{2}; // H
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SIMDThreeSameVectorBF16MatrixMul<string asm>
: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
V128, asm, ".4s",
[(set (v4f32 V128:$dst),
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
", $Rm", ".8h", "}");
}
class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
"bfcvtn2", ".8h", ".4s",
[(set (v8bf16 V128:$dst),
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-10} = 0b0001111001100011010000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
//----------------------------------------------------------------------------
// Armv8.6 Matrix Multiply Extension
//----------------------------------------------------------------------------
class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
}
//----------------------------------------------------------------------------
// ARMv8.2-A Dot Product Instructions (Indexed)
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
string dst_kind, string lhs_kind, string rhs_kind,
RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H
}
multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
V64, v2i32, v8i8, OpNode>;
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
V128, v4i32, v16i8, OpNode>;
}
// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm,
string dst_kind, string lhs_kind,
string rhs_kind, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, RegType, RegType, V128,
VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType (AArch64duplane16 (v8f16 V128:$Rm),
VectorIndexH:$idx)))))]> {
// idx = H:L:M
bits<3> idx;
let Inst{11} = idx{2}; // H
let Inst{21} = idx{1}; // L
let Inst{20} = idx{0}; // M
}
multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h",
V64, v2f32, v4f16, OpNode>;
def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h",
V128, v4f32, v8f16, OpNode>;
}
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4f16 V64:$Rd),
(OpNode (v4f16 V64:$Rn),
(v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8f16 V128:$Rd),
(OpNode (v8f16 V128:$Rn),
(v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2f32 V64:$Rd),
(OpNode (v2f32 V64:$Rn),
(v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4f32 V128:$Rd),
(OpNode (v4f32 V128:$Rn),
(v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
V128, V128,
V128, VectorIndexD,
asm, ".2d", ".2d", ".2d", ".d",
[(set (v2f64 V128:$Rd),
(OpNode (v2f64 V128:$Rn),
(v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
}
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h",
[(set (f16 FPR16Op:$Rd),
(OpNode (f16 FPR16Op:$Rn),
(f16 (vector_extract (v8f16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
} // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (f32 FPR32Op:$Rd),
(OpNode (f32 FPR32Op:$Rn),
(f32 (vector_extract (v4f32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
FPR64Op, FPR64Op, V128, VectorIndexD,
asm, ".d", "", "", ".d",
[(set (f64 FPR64Op:$Rd),
(OpNode (f64 FPR64Op:$Rn),
(f64 (vector_extract (v2f64 V128:$Rm),
VectorIndexD:$idx))))]> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
}
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
// Patterns for f16: DUPLANE, DUP scalar and vector_extract.
def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
(AArch64duplane16 (v8f16 V128_lo:$Rm),
VectorIndexH:$idx))),
(!cast<Instruction>(INST # "v8i16_indexed")
V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
(AArch64dup (f16 FPR16Op_lo:$Rm)))),
(!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
(AArch64duplane16 (v8f16 V128_lo:$Rm),
VectorIndexH:$idx))),
(!cast<Instruction>(INST # "v4i16_indexed")
V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
(AArch64dup (f16 FPR16Op_lo:$Rm)))),
(!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
V128_lo:$Rm, VectorIndexH:$idx)>;
} // Predicates = [HasNEON, HasFullFP16]
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
VectorIndexS:$idx))),
(!cast<Instruction>(INST # v2i32_indexed)
V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64dup (f32 FPR32Op:$Rm)))),
(!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v4i32_indexed")
V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64dup (f32 FPR32Op:$Rm)))),
(!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64duplane64 (v2f64 V128:$Rm),
VectorIndexD:$idx))),
(!cast<Instruction>(INST # "v2i64_indexed")
V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64dup (f64 FPR64Op:$Rm)))),
(!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
// Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
V128:$Rm, VectorIndexD:$idx)>;
}
multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
V128, V128,
V128, VectorIndexD,
asm, ".2d", ".2d", ".2d", ".d", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
}
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
} // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
FPR64Op, FPR64Op, V128, VectorIndexD,
asm, ".d", "", "", ".d", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
}
}
multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
SDPatternOperator OpNodeLaneQ> {
def : Pat<(v4i16 (OpNodeLane
(v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v4i16_indexed) $Rn,
(SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i16 (OpNodeLaneQ
(v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
VectorIndexH32b:$idx)),
(!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLane
(v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v8i16_indexed) $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLaneQ
(v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
VectorIndexH32b:$idx)),
(!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v2i32 (OpNodeLane
(v2i32 V64:$Rn), (v2i32 V64:$Rm),
VectorIndexD32b:$idx)),
(!cast<Instruction>(NAME # v2i32_indexed) $Rn,
(SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v2i32 (OpNodeLaneQ
(v2i32 V64:$Rn), (v4i32 V128:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLane
(v4i32 V128:$Rn), (v2i32 V64:$Rm),
VectorIndexD32b:$idx)),
(!cast<Instruction>(NAME # v4i32_indexed) $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLaneQ
(v4i32 V128:$Rn),
(v4i32 V128:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
}
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$Rd),
(OpNode (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$Rd),
(OpNode (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$Rd),
(OpNode FPR32Op:$Rn,
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$Rd),
(OpNode (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$Rd),
(OpNode (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
FPR32Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
// intermediate EXTRACT_SUBREG would be untyped.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract (v4i32
(int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx)))),
(i64 0))))),
(EXTRACT_SUBREG
(!cast<Instruction>(NAME # v4i16_indexed)
(SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
V128_lo:$Rm, VectorIndexH:$idx),
ssub)>;
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull
(extract_high_v8i16 V128:$Rn),
(extract_high_v8i16
(AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull
(extract_high_v4i32 V128:$Rn),
(extract_high_v4i32
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR32Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i64 FPR64Op:$dst),
(Accum (i64 FPR64Op:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
}
multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
(extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
(extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
}
//----------------------------------------------------------------------------
// AdvSIMD scalar shift by immediate
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
RegisterClass regtype1, RegisterClass regtype2,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-23} = 0b111110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
RegisterClass regtype1, RegisterClass regtype2,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-23} = 0b111110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
}
} // Predicates = [HasNEON, HasFullFP16]
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
}
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
}
}
multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm,
[(set (i64 FPR64:$Rd),
(OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
let Inst{21-16} = imm{5-0};
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
}
multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm,
[(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
(i32 vecshiftR64:$imm)))]> {
let Inst{21-16} = imm{5-0};
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
vecshiftR64:$imm)>;
}
multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm,
[(set (v1i64 FPR64:$Rd),
(OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
let Inst{21-16} = imm{5-0};
}
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm, []> {
let Inst{21-16} = imm{5-0};
}
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR16, vecshiftR8, asm, []> {
let Inst{18-16} = imm{2-0};
}
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR32, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
}
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR64, vecshiftR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
let Inst{20-16} = imm{4-0};
}
}
multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR8, vecshiftL8, asm, []> {
let Inst{18-16} = imm{2-0};
}
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftL16, asm, []> {
let Inst{19-16} = imm{3-0};
}
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftL32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
let Inst{20-16} = imm{4-0};
}
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
let Inst{21-16} = imm{5-0};
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
}
multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR8, vecshiftR8, asm, []> {
let Inst{18-16} = imm{2-0};
}
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
}
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
}
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
}
}
//----------------------------------------------------------------------------
// AdvSIMD vector x indexed element
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
RegisterOperand dst_reg, RegisterOperand src_reg,
Operand immtype,
string asm, string dst_kind, string src_kind,
list<dag> pattern>
: I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-23} = 0b011110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
RegisterOperand vectype1, RegisterOperand vectype2,
Operand immtype,
string asm, string dst_kind, string src_kind,
list<dag> pattern>
: I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-23} = 0b011110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V128, vecshiftR16Narrow,
asm, ".8b", ".8h",
[(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR16Narrow,
asm#"2", ".16b", ".8h", []> {
bits<3> imm;
let Inst{18-16} = imm;
let hasSideEffects = 0;
}
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V128, vecshiftR32Narrow,
asm, ".4h", ".4s",
[(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR32Narrow,
asm#"2", ".8h", ".4s", []> {
bits<4> imm;
let Inst{19-16} = imm;
let hasSideEffects = 0;
}
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V128, vecshiftR64Narrow,
asm, ".2s", ".2d",
[(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR64Narrow,
asm#"2", ".4s", ".2d", []> {
bits<5> imm;
let Inst{20-16} = imm;
let hasSideEffects = 0;
}
// TableGen doesn't like patterns with INSERT_SUBREG on the instructions
// themselves, so put them here instead.
// Patterns involving what's effectively an insert high and a normal
// intrinsic, represented by CONCAT_VECTORS.
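// For instance, assuming this multiclass instantiates a narrowing shift such
// as SHRN/SHRN2, a concat_vectors of an existing low half with the narrowed
// result maps onto the tied "2" form: INSERT_SUBREG places the low half into
// dsub of an IMPLICIT_DEF, and the "2" instruction fills the high half.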
def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
vecshiftR16Narrow:$imm)),
(!cast<Instruction>(NAME # "v16i8_shift")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
vecshiftR32Narrow:$imm)),
(!cast<Instruction>(NAME # "v8i16_shift")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR32Narrow:$imm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
vecshiftR64Narrow:$imm)),
(!cast<Instruction>(NAME # "v4i32_shift")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR64Narrow:$imm)>;
}
multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftL8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftL16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftL32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftL64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(i32 vecshiftL64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftR8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(i32 vecshiftR64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftR8, asm, ".8b", ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR8, asm, ".16b", ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16, asm, ".4h", ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16, asm, ".8h", ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32, asm, ".2s", ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32, asm, ".4s", ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
(i32 vecshiftR64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftL8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftL16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftL32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftL64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
(i32 vecshiftL64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
}
}
multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V128, V64, vecshiftL8, asm, ".8h", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm#"2", ".8h", ".16b",
[(set (v8i16 V128:$Rd),
(OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V128, V64, vecshiftL16, asm, ".4s", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm#"2", ".4s", ".8h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
}
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V128, V64, vecshiftL32, asm, ".2d", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm#"2", ".2d", ".4s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
}
//---
// Vector load/store
//---
// SIMD ldX/stX no-index memory references don't allow the optional
// ", #0" constant and handle post-indexing explicitly, so we use
// a more specialized parse method for them. Otherwise, it's the same as
// the general GPR64sp handling.
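// For example (illustrative, following the note above): "ld1 { v0.16b }, [x0]"
// is accepted, while "ld1 { v0.16b }, [x0, #0]" is rejected; post-indexed
// forms such as "ld1 { v0.16b }, [x0], #16" are handled by the separate _POST
// variants defined below.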
class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
string asm, dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-23} = 0b0011000;
let Inst{22} = L;
let Inst{21-16} = 0b000000;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
}
class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
string asm, dag oops, dag iops>
: I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
bits<5> Vt;
bits<5> Rn;
bits<5> Xm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-23} = 0b0011001;
let Inst{22} = L;
let Inst{21} = 0;
let Inst{20-16} = Xm;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
}
// The immediate form of AdvSIMD post-indexed addressing is encoded with
// register post-index addressing from the zero register.
multiclass SIMDLdStAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
// "ld1\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
// E.g. "ld1.8b { v0, v1 }, [x1], #16"
// "ld1.8b\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
// E.g. "ld1.8b { v0, v1 }, [x1]"
// "ld1\t$Vt, [$Rn]"
// may get mapped to
// (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
(!cast<Instruction>(BaseName # Count # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
// E.g. "ld1.8b { v0, v1 }, [x1], x2"
// "ld1\t$Vt, [$Rn], $Xm"
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
multiclass BaseSIMDLdN<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
(ins GPR64sp:$Rn), []>;
def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
(outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
(ins GPR64sp:$Rn), []>;
def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
(outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
(ins GPR64sp:$Rn), []>;
def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
(outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
(ins GPR64sp:$Rn), []>;
def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
(ins GPR64sp:$Rn), []>;
def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
(outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
(ins GPR64sp:$Rn), []>;
def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
(outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
(ins GPR64sp:$Rn), []>;
def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "16b"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "8h"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "4s"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "2d"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "8b"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "4h"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "2s"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
// Only ld1/st1 has a v1d version.
multiclass BaseSIMDStN<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
GPR64sp:$Rn), []>;
def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
GPR64sp:$Rn), []>;
def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
GPR64sp:$Rn), []>;
def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
GPR64sp:$Rn), []>;
def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
GPR64sp:$Rn), []>;
def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
GPR64sp:$Rn), []>;
def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
GPR64sp:$Rn), []>;
def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
multiclass BaseSIMDLd1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
: BaseSIMDLdN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// LD1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
(outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
(ins GPR64sp:$Rn), []>;
def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "1d"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
multiclass BaseSIMDSt1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
: BaseSIMDStN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// ST1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
GPR64sp:$Rn), []>;
def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
multiclass SIMDLd1Multiple<string asm> {
defm One : BaseSIMDLd1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
defm Two : BaseSIMDLd1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
defm Three : BaseSIMDLd1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
defm Four : BaseSIMDLd1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDSt1Multiple<string asm> {
defm One : BaseSIMDSt1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
defm Two : BaseSIMDSt1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
defm Three : BaseSIMDSt1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
defm Four : BaseSIMDSt1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDLd2Multiple<string asm> {
defm Two : BaseSIMDLdN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDSt2Multiple<string asm> {
defm Two : BaseSIMDStN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDLd3Multiple<string asm> {
defm Three : BaseSIMDLdN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDSt3Multiple<string asm> {
defm Three : BaseSIMDStN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDLd4Multiple<string asm> {
defm Four : BaseSIMDLdN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
multiclass SIMDSt4Multiple<string asm> {
defm Four : BaseSIMDStN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
//---
// AdvSIMD Load/store single-element
//---
class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
string asm, string operands, string cst,
dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, operands, cst, pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{29-24} = 0b001101;
let Inst{22} = L;
let Inst{21} = R;
let Inst{15-13} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
}
class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
string asm, string operands, string cst,
dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{29-24} = 0b001101;
let Inst{22} = L;
let Inst{21} = R;
let Inst{15-13} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
DAGOperand listtype>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
(outs listtype:$Vt), (ins GPR64sp:$Rn),
[]> {
let Inst{30} = Q;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = S;
let Inst{11-10} = size;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
string asm, DAGOperand listtype, DAGOperand GPR64pi>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
"$Rn = $wback",
(outs GPR64sp:$wback, listtype:$Vt),
(ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
bits<5> Xm;
let Inst{30} = Q;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = S;
let Inst{11-10} = size;
}
multiclass SIMDLdrAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1r { v0.8b }, [x1], #1"
// "ld1r.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
// E.g. "ld1r.8b { v0 }, [x1], #1"
// "ld1r.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
// E.g. "ld1r.8b { v0 }, [x1]"
// "ld1r.8b\t$Vt, [$Rn]"
// may get mapped to
// (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
(!cast<Instruction>(BaseName # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
// E.g. "ld1r.8b { v0 }, [x1], x2"
// "ld1r.8b\t$Vt, [$Rn], $Xm"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
int Offset1, int Offset2, int Offset4, int Offset8> {
def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "8b")>;
def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count #"16b")>;
def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count #"4h")>;
def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count #"8h")>;
def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count #"2s")>;
def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count #"4s")>;
def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count #"1d")>;
def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count #"2d")>;
def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "8b"),
!cast<DAGOperand>("GPR64pi" # Offset1)>;
def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "16b"),
!cast<DAGOperand>("GPR64pi" # Offset1)>;
def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count # "4h"),
!cast<DAGOperand>("GPR64pi" # Offset2)>;
def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count # "8h"),
!cast<DAGOperand>("GPR64pi" # Offset2)>;
def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count # "2s"),
!cast<DAGOperand>("GPR64pi" # Offset4)>;
def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count # "4s"),
!cast<DAGOperand>("GPR64pi" # Offset4)>;
def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count # "1d"),
!cast<DAGOperand>("GPR64pi" # Offset8)>;
def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count # "2d"),
!cast<DAGOperand>("GPR64pi" # Offset8)>;
defm : SIMDLdrAliases<NAME, asm, "8b", Count, Offset1, 64>;
defm : SIMDLdrAliases<NAME, asm, "16b", Count, Offset1, 128>;
defm : SIMDLdrAliases<NAME, asm, "4h", Count, Offset2, 64>;
defm : SIMDLdrAliases<NAME, asm, "8h", Count, Offset2, 128>;
defm : SIMDLdrAliases<NAME, asm, "2s", Count, Offset4, 64>;
defm : SIMDLdrAliases<NAME, asm, "4s", Count, Offset4, 128>;
defm : SIMDLdrAliases<NAME, asm, "1d", Count, Offset8, 64>;
defm : SIMDLdrAliases<NAME, asm, "2d", Count, Offset8, 128>;
}
class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S:size fields.
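// Illustrative worked example (not from the source): a byte lane index of 11
// (0b1011) yields Q = idx{3} = 1, S = idx{2} = 0, size = idx{1-0} = 0b11.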
bits<4> idx;
let Inst{30} = idx{3};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
}
class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S:size fields.
bits<4> idx;
let Inst{30} = idx{3};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
}
class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size fields.
bits<4> idx;
bits<5> Xm;
let Inst{30} = idx{3};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
}
class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size fields.
bits<4> idx;
bits<5> Xm;
let Inst{30} = idx{3};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
}
class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
let Inst{30} = idx{2};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
}
class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
let Inst{30} = idx{2};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
}
class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
bits<5> Xm;
let Inst{30} = idx{2};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
}
class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
bits<5> Xm;
let Inst{30} = idx{2};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
}
class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S fields.
bits<2> idx;
let Inst{30} = idx{1};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{0};
let Inst{11-10} = size;
}
class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S fields.
bits<2> idx;
let Inst{30} = idx{1};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{0};
let Inst{11-10} = size;
}
class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S fields.
bits<2> idx;
bits<5> Xm;
let Inst{30} = idx{1};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{0};
let Inst{11-10} = size;
}
class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S fields.
bits<2> idx;
bits<5> Xm;
let Inst{30} = idx{1};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{0};
let Inst{11-10} = size;
}
class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q field.
bits<1> idx;
let Inst{30} = idx;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = 0;
let Inst{11-10} = size;
}
class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q field.
bits<1> idx;
let Inst{30} = idx;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = 0;
let Inst{11-10} = size;
}
class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q field.
bits<1> idx;
bits<5> Xm;
let Inst{30} = idx;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = 0;
let Inst{11-10} = size;
}
class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q field.
bits<1> idx;
bits<5> Xm;
let Inst{30} = idx;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = 0;
let Inst{11-10} = size;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn), []>;
def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn), []>;
def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn), []>;
def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn), []>;
def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i8 : SIMDLdStSingleB<0, R, opcode, asm,
(outs), (ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn), []>;
def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn), []>;
def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn), []>;
def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn), []>;
def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
}
multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
string Count, int Offset, Operand idxtype> {
// E.g. "ld1 { v0.8b }[0], [x1], #1"
// "ld1\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
(!cast<Instruction>(NAME # Type # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
idxtype:$idx, XZR), 1>;
// E.g. "ld1.8b { v0 }[0], [x1], #1"
// "ld1.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
(!cast<Instruction>(NAME # Type # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
idxtype:$idx, XZR), 0>;
// E.g. "ld1.8b { v0 }[0], [x1]"
// "ld1.8b\t$Vt, [$Rn]"
// may get mapped to
// (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
(!cast<Instruction>(NAME # Type)
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
idxtype:$idx, GPR64sp:$Rn), 0>;
// E.g. "ld1.8b { v0 }[0], [x1], x2"
// "ld1.8b\t$Vt, [$Rn], $Xm"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
(!cast<Instruction>(NAME # Type # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
idxtype:$idx,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
multiclass SIMDLdSt1SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
}
multiclass SIMDLdSt2SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
}
multiclass SIMDLdSt3SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
}
multiclass SIMDLdSt4SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
}
} // end of 'let Predicates = [HasNEON]'
//----------------------------------------------------------------------------
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
: BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator Accum> {
def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
(v4i16 V64:$Rm)))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
(v8i16 V128:$Rm)))))]>;
def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
(v2i32 V64:$Rm)))))]>;
def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
(v4i32 V128:$Rm)))))]>;
}
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh
(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh
(v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
// FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
// got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (insert_subvector
(undef),
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i32 0))),
(i64 0))))),
(EXTRACT_SUBREG
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
FPR32Op:$Rd,
ssub)),
V64:$Rn,
V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i64 0))))),
(EXTRACT_SUBREG
(v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
FPR32Op:$Rd,
ssub)),
V128:$Rn,
V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
VectorIndexH, asm, ".h", "", "", ".h",
[]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
(Accum (i32 FPR32Op:$Rd),
(i32 (int_aarch64_neon_sqrdmulh
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
} // let Predicates = [HasNEON, HasRDM]
//----------------------------------------------------------------------------
// ARMv8.3 Complex ADD/MLA instructions
//----------------------------------------------------------------------------
class ComplexRotationOperand<int Angle, int Remainder, string Type>
: AsmOperandClass {
let PredicateMethod = "isComplexRotation<" # Angle # ", " # Remainder # ">";
let DiagnosticType = "InvalidComplexRotation" # Type;
let Name = "ComplexRotation" # Type;
}
def complexrotateop : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">;
let PrintMethod = "printComplexRotationOp<90, 0>";
}
def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
let PrintMethod = "printComplexRotationOp<180, 90>";
}
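// Illustrative examples of the transforms above: the even operand encodes
// rot/90, so rotations 0/90/180/270 map to 0/1/2/3; the odd operand encodes
// (rot - 90)/180, so rotations 90 and 270 map to 0 and 1.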
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
RegisterOperand regtype, Operand rottype,
string asm, string kind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<1> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = opcode;
// Non-tied version (FCADD) only has one rotation bit
let Inst{12} = rot;
let Inst{11} = 0;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// Armv8.3 CompNum - Floating-point complex number support
multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
string asm, SDPatternOperator OpNode>{
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype,
asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
(rottype i32:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype,
asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
(rottype i32:$rot)))]>;
}
let Predicates = [HasComplxNum, HasNEON] in {
def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype,
asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
(rottype i32:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype,
asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
(rottype i32:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype,
asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
(rottype i32:$rot)))]>;
}
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
bits<3> opcode,
RegisterOperand regtype,
Operand rottype, string asm,
string kind, list<dag> pattern>
: I<(outs regtype:$dst),
(ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<2> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = opcode;
let Inst{12-11} = rot;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
Operand rottype, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64,
rottype, asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
(rottype i32:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128,
rottype, asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
(rottype i32:$rot)))]>;
}
let Predicates = [HasComplxNum, HasNEON] in {
def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64,
rottype, asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
(rottype i32:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128,
rottype, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
(rottype i32:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128,
rottype, asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
(rottype i32:$rot)))]>;
}
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
bit opc1, bit opc2, RegisterOperand dst_reg,
RegisterOperand lhs_reg,
RegisterOperand rhs_reg, Operand vec_idx,
Operand rottype, string asm, string apple_kind,
string dst_kind, string lhs_kind,
string rhs_kind, list<dag> pattern>
: I<(outs dst_reg:$dst),
(ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx, rottype:$rot),
asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind #
"$idx, $rot" # "|" # apple_kind #
"\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<2> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28} = Scalar;
let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
// Bit 21 must be set by the derived class.
let Inst{20-16} = Rm;
let Inst{15} = opc1;
let Inst{14-13} = rot;
let Inst{12} = opc2;
// Bit 11 must be set by the derived class.
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
// The complex instructions index by pairs of elements, so the VectorIndexes
// don't match the lane types, and the index bits are different from the other
// classes.
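// For example, the indexed ".4s" form below selects one of two (real, imag)
// element pairs, so it takes a 1-bit VectorIndexD-style index rather than the
// 2-bit index a plain ".s" lane would need.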
multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64,
V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h",
".4h", ".h", []> {
bits<1> idx;
let Inst{11} = 0;
let Inst{21} = idx{0};
}
def v8f16_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b01, opc1, opc2,
V128, V128, V128, VectorIndexS, rottype, asm, ".8h",
".8h", ".8h", ".h", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
} // Predicates = [HasComplxNum, HasNEON, HasFullFP16]
let Predicates = [HasComplxNum, HasNEON] in {
def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2,
V128, V128, V128, VectorIndexD, rottype, asm, ".4s",
".4s", ".4s", ".s", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
}
} // Predicates = [HasComplxNum, HasNEON]
}
//----------------------------------------------------------------------------
// Crypto extensions
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
list<dag> pat>
: I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
Sched<[WriteV]>{
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0100111000101000;
let Inst{15-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
: AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
: AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
"$Rd = $dst",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
dag oops, dag iops, list<dag> pat>
: I<oops, iops, asm,
"{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
"|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
Sched<[WriteV]>{
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-21} = 0b01011110000;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
(ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
[(set (v4i32 FPR128:$dst),
(OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
(v4i32 V128:$Rm)))]>;
class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128:$Rm),
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(v4i32 V128:$Rm)))]>;
class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
(ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
[(set (v4i32 FPR128:$dst),
(OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
(v4i32 V128:$Rm)))]>;
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class SHA2OpInst<bits<4> opc, string asm, string kind,
string cstr, dag oops, dag iops,
list<dag> pat>
: I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
"|" # kind # "\t$Rd, $Rn}", cstr, pat>,
Sched<[WriteV]>{
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0101111000101000;
let Inst{15-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
(ins V128:$Rd, V128:$Rn),
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
// Armv8.2-A Crypto extensions
class BaseCryptoV82<dag oops, dag iops, string asm, string asmops, string cst,
list<dag> pattern>
: I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
let Inst{31-25} = 0b1100111;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
}
class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops,
"$Vm = $Vd", []> {
let Inst{31-25} = 0b1100111;
let Inst{24-21} = 0b0110;
let Inst{20-15} = 0b000001;
let Inst{14} = op0;
let Inst{13-12} = 0b00;
let Inst{11-10} = op1;
}
class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm>
: CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d|.2d\t$Vd, $Vn}">;
class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm>
: CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s|.4s\t$Vd, $Vn}">;
class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
string asmops, string cst>
: BaseCryptoV82<oops, iops, asm , asmops, cst, []> {
bits<5> Vm;
let Inst{24-21} = 0b0011;
let Inst{20-16} = Vm;
let Inst{15} = 0b1;
let Inst{14} = op0;
let Inst{13-12} = 0b00;
let Inst{11-10} = op1;
}
class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">;
class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
"{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">;
class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
"{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm),
asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRRR<bits<2>op0, string asm, string asmops>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm,
asmops, "", []> {
bits<5> Vm;
bits<5> Va;
let Inst{24-23} = 0b00;
let Inst{22-21} = op0;
let Inst{20-16} = Vm;
let Inst{15} = 0b0;
let Inst{14-10} = Va;
}
class CryptoRRRR_16B<bits<2>op0, string asm>
: CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b" #
"|.16b\t$Vd, $Vn, $Vm, $Va}"> {
}
class CryptoRRRR_4S<bits<2>op0, string asm>
: CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s" #
"|.4s\t$Vd, $Vn, $Vm, $Va}"> {
}
class CryptoRRRi6<string asm>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" #
"|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> {
bits<6> imm;
bits<5> Vm;
let Inst{24-21} = 0b0100;
let Inst{20-16} = Vm;
let Inst{15-10} = imm;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
}
class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm>
: BaseCryptoV82<(outs V128:$Vdst),
(ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm),
asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" #
"|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> {
bits<2> imm;
bits<5> Vm;
let Inst{24-21} = 0b0010;
let Inst{20-16} = Vm;
let Inst{15} = 0b1;
let Inst{14} = op0;
let Inst{13-12} = imm;
let Inst{11-10} = op1;
}
//----------------------------------------------------------------------------
// v8.1 atomic instructions extension:
// * CAS
// * CASP
// * SWP
// * LDOPregister<OP>, and aliases STOPregister<OP>
// Instruction encodings:
//
// 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0
// CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt
// CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt
// SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt
// LD SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |Rt
// ST SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |11111
// Instruction syntax:
//
// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
// ST<OP>{<order>} <Xs>, [<Xn|SP>]
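// For example (illustrative): "casal w0, w1, [x2]" is a 32-bit CAS with both
// acquire and release semantics, and "ldaddal x0, x1, [x2]" atomically adds
// x0 to the doubleword at [x2], returning the old value in x1.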
let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
string cstr, list<dag> pattern>
: I<oops, iops, asm, operands, cstr, pattern> {
bits<2> Sz;
bit NP;
bit Acq;
bit Rel;
bits<5> Rs;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b001000;
let Inst{23} = NP;
let Inst{22} = Acq;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = Rel;
let Inst{14-10} = 0b11111;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Predicates = [HasLSE];
}
class BaseCAS<string order, string size, RegisterClass RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"cas" # order # size, "\t$Rs, $Rt, [$Rn]",
"$out = $Rs",[]>,
Sched<[WriteAtomic]> {
let NP = 1;
}
multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseCAS<order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseCAS<order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseCAS<order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseCAS<order, "", GPR64>;
}
class BaseCASP<string order, string size, RegisterOperand RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"casp" # order # size, "\t$Rs, $Rt, [$Rn]",
"$out = $Rs",[]>,
Sched<[WriteAtomic]> {
let NP = 0;
}
multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in
def W : BaseCASP<order, "", WSeqPairClassOperand>;
let Sz = 0b01, Acq = Acq, Rel = Rel in
def X : BaseCASP<order, "", XSeqPairClassOperand>;
}
let Predicates = [HasLSE] in
class BaseSWP<string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
"\t$Rs, $Rt, [$Rn]","",[]>,
Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
bits<5> Rs;
bits<3> opc = 0b000;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b111000;
let Inst{23} = Acq;
let Inst{22} = Rel;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = 0b1;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Predicates = [HasLSE];
}
multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseSWP<order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseSWP<order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseSWP<order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseSWP<order, "", GPR64>;
}
let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
"\t$Rs, $Rt, [$Rn]","",[]>,
Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
bits<5> Rs;
bits<3> opc;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b111000;
let Inst{23} = Acq;
let Inst{22} = Rel;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = 0b0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Predicates = [HasLSE];
}
multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in
def B : BaseLDOPregister<op, order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in
def H : BaseLDOPregister<op, order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in
def W : BaseLDOPregister<op, order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in
def X : BaseLDOPregister<op, order, "", GPR64>;
}
// Differing SrcRHS and DstRHS allow you to cover CLR & SUB by giving a more
// complex DAG for DstRHS.
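// For example (illustrative note): atomic_load_sub can be covered by LDADD by
// materializing the negated operand (a SUB from the zero register) as DstRHS,
// and atomic_load_and by LDCLR with the operand complemented via ORN against
// the zero register; see LDOPregister_patterns_mod below. The corresponding
// defm instantiations live elsewhere in the target's .td files.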
let Predicates = [HasLSE] in
multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag SrcRHS, dag DstRHS> {
def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
}
multiclass LDOPregister_patterns_ord<string inst, string suffix, string op,
string size, dag RHS> {
defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, RHS, RHS>;
}
multiclass LDOPregister_patterns_ord_mod<string inst, string suffix, string op,
string size, dag LHS, dag RHS> {
defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, LHS, RHS>;
}
multiclass LDOPregister_patterns<string inst, string op> {
defm : LDOPregister_patterns_ord<inst, "X", op, "64", (i64 GPR64:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "W", op, "32", (i32 GPR32:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "H", op, "16", (i32 GPR32:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "B", op, "8", (i32 GPR32:$Rm)>;
}
multiclass LDOPregister_patterns_mod<string inst, string op, string mod> {
defm : LDOPregister_patterns_ord_mod<inst, "X", op, "64",
(i64 GPR64:$Rm),
(i64 (!cast<Instruction>(mod#Xrr) XZR, GPR64:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "W", op, "32",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "H", op, "16",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "B", op, "8",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
}
let Predicates = [HasLSE] in
multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag OLD, dag NEW> {
def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
}
multiclass CASregister_patterns_ord<string inst, string suffix, string op,
string size, dag OLD, dag NEW> {
defm : CASregister_patterns_ord_dag<inst, suffix, op, size, OLD, NEW>;
}
multiclass CASregister_patterns<string inst, string op> {
defm : CASregister_patterns_ord<inst, "X", op, "64",
(i64 GPR64:$Rold), (i64 GPR64:$Rnew)>;
defm : CASregister_patterns_ord<inst, "W", op, "32",
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
defm : CASregister_patterns_ord<inst, "H", op, "16",
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
defm : CASregister_patterns_ord<inst, "B", op, "8",
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
}
let Predicates = [HasLSE] in
class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
Instruction inst> :
InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
multiclass STOPregister<string asm, string instr> {
def : BaseSTOPregister<asm # "lb", GPR32, WZR,
!cast<Instruction>(instr # "LB")>;
def : BaseSTOPregister<asm # "lh", GPR32, WZR,
!cast<Instruction>(instr # "LH")>;
def : BaseSTOPregister<asm # "l", GPR32, WZR,
!cast<Instruction>(instr # "LW")>;
def : BaseSTOPregister<asm # "l", GPR64, XZR,
!cast<Instruction>(instr # "LX")>;
def : BaseSTOPregister<asm # "b", GPR32, WZR,
!cast<Instruction>(instr # "B")>;
def : BaseSTOPregister<asm # "h", GPR32, WZR,
!cast<Instruction>(instr # "H")>;
def : BaseSTOPregister<asm, GPR32, WZR,
!cast<Instruction>(instr # "W")>;
def : BaseSTOPregister<asm, GPR64, XZR,
!cast<Instruction>(instr # "X")>;
}
//----------------------------------------------------------------------------
// Allow the size specifier tokens to be upper case, not just lower.
def : TokenAlias<".4B", ".4b">; // Add dot product
def : TokenAlias<".8B", ".8b">;
def : TokenAlias<".4H", ".4h">;
def : TokenAlias<".2S", ".2s">;
def : TokenAlias<".1D", ".1d">;
def : TokenAlias<".16B", ".16b">;
def : TokenAlias<".8H", ".8h">;
def : TokenAlias<".4S", ".4s">;
def : TokenAlias<".2D", ".2d">;
def : TokenAlias<".1Q", ".1q">;
def : TokenAlias<".2H", ".2h">;
def : TokenAlias<".B", ".b">;
def : TokenAlias<".H", ".h">;
def : TokenAlias<".S", ".s">;
def : TokenAlias<".D", ".d">;
def : TokenAlias<".Q", ".q">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5139ae5ccaf1..08f80c9aa361 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1,6909 +1,6938 @@
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
static cl::opt<unsigned> TBZDisplacementBits(
"aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned> CBZDisplacementBits(
"aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned>
BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
RI(STI.getTargetTriple()), Subtarget(STI) {}
/// getInstSizeInBytes - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
{
auto Op = MI.getOpcode();
if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
}
// Meta-instructions emit no code.
if (MI.isMetaInstruction())
return 0;
// FIXME: We currently only handle pseudoinstructions that don't get expanded
// before the assembly printer.
unsigned NumBytes = 0;
const MCInstrDesc &Desc = MI.getDesc();
switch (Desc.getOpcode()) {
default:
// Anything not explicitly designated otherwise is a normal 4-byte insn.
NumBytes = 4;
break;
case TargetOpcode::STACKMAP:
// The upper bound for a stackmap intrinsic is the full length of its shadow
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
break;
case TargetOpcode::PATCHPOINT:
// The size of the patchpoint intrinsic is the number of bytes requested
NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
break;
case AArch64::TLSDESC_CALLSEQ:
// This gets lowered to an instruction sequence which takes 16 bytes
NumBytes = 16;
break;
case AArch64::SpeculationBarrierISBDSBEndBB:
// This gets lowered to 2 4-byte instructions.
NumBytes = 8;
break;
case AArch64::SpeculationBarrierSBEndBB:
// This gets lowered to one 4-byte instruction.
NumBytes = 4;
break;
case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
NumBytes = 12;
break;
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
case TargetOpcode::BUNDLE:
NumBytes = getInstBundleLength(MI);
break;
}
return NumBytes;
}
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
unsigned Size = 0;
MachineBasicBlock::const_instr_iterator I = MI.getIterator();
MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
assert(!I->isBundle() && "No nested bundle!");
Size += getInstSizeInBytes(*I);
}
return Size;
}
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
SmallVectorImpl<MachineOperand> &Cond) {
// Block ends with fall-through condbranch.
switch (LastInst->getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
Target = LastInst->getOperand(1).getMBB();
Cond.push_back(LastInst->getOperand(0));
break;
case AArch64::CBZW:
case AArch64::CBZX:
case AArch64::CBNZW:
case AArch64::CBNZX:
Target = LastInst->getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(-1));
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
Cond.push_back(LastInst->getOperand(0));
break;
case AArch64::TBZW:
case AArch64::TBZX:
case AArch64::TBNZW:
case AArch64::TBNZX:
Target = LastInst->getOperand(2).getMBB();
Cond.push_back(MachineOperand::CreateImm(-1));
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
Cond.push_back(LastInst->getOperand(0));
Cond.push_back(LastInst->getOperand(1));
}
}
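// For reference, the Cond vector built above (and consumed by
// reverseBranchCondition, instantiateCondBranch and insertSelect below)
// takes one of three shapes:
//   Bcc:       { <condition code> }
//   CBZ/CBNZ:  { -1, <opcode>, <register> }
//   TBZ/TBNZ:  { -1, <opcode>, <register>, <bit number> }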
static unsigned getBranchDisplacementBits(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("unexpected opcode!");
case AArch64::B:
return 64;
case AArch64::TBNZW:
case AArch64::TBZW:
case AArch64::TBNZX:
case AArch64::TBZX:
return TBZDisplacementBits;
case AArch64::CBNZW:
case AArch64::CBZW:
case AArch64::CBNZX:
case AArch64::CBZX:
return CBZDisplacementBits;
case AArch64::Bcc:
return BCCDisplacementBits;
}
}
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
int64_t BrOffset) const {
unsigned Bits = getBranchDisplacementBits(BranchOp);
assert(Bits >= 3 && "max branch displacement must be enough to jump "
"over conditional branch expansion");
return isIntN(Bits, BrOffset / 4);
}
MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
llvm_unreachable("unexpected opcode!");
case AArch64::B:
return MI.getOperand(0).getMBB();
case AArch64::TBZW:
case AArch64::TBNZW:
case AArch64::TBZX:
case AArch64::TBNZX:
return MI.getOperand(2).getMBB();
case AArch64::CBZW:
case AArch64::CBNZW:
case AArch64::CBZX:
case AArch64::CBNZX:
case AArch64::Bcc:
return MI.getOperand(1).getMBB();
}
}
// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
// Skip over SpeculationBarrierEndBB terminators
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
--I;
}
if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
MachineInstr *LastInst = &*I;
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
}
if (isCondBranchOpcode(LastOpc)) {
// Block ends with fall-through condbranch.
parseCondBranch(LastInst, TBB, Cond);
return false;
}
return true; // Can't handle indirect branch.
}
// Get the instruction before it if it is a terminator.
MachineInstr *SecondLastInst = &*I;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If AllowModify is true and the block ends with two or more unconditional
// branches, delete all but the first unconditional branch.
if (AllowModify && isUncondBranchOpcode(LastOpc)) {
while (isUncondBranchOpcode(SecondLastOpc)) {
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
// Return now; the only terminator is an unconditional branch.
TBB = LastInst->getOperand(0).getMBB();
return false;
} else {
SecondLastInst = &*I;
SecondLastOpc = SecondLastInst->getOpcode();
}
}
}
// If there are three terminators, we don't know what sort of block this is.
if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with a B and a Bcc, handle it.
if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
parseCondBranch(SecondLastInst, TBB, Cond);
FBB = LastInst->getOperand(0).getMBB();
return false;
}
// If the block ends with two unconditional branches, handle it. The second
// one is not executed, so remove it.
if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
TBB = SecondLastInst->getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
I->eraseFromParent();
return false;
}
// ...likewise if it ends with an indirect branch followed by an unconditional
// branch.
if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
I = LastInst;
if (AllowModify)
I->eraseFromParent();
return true;
}
// Otherwise, can't handle this.
return true;
}
bool AArch64InstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
} else {
// Folded compare-and-branch
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown conditional branch!");
case AArch64::CBZW:
Cond[1].setImm(AArch64::CBNZW);
break;
case AArch64::CBNZW:
Cond[1].setImm(AArch64::CBZW);
break;
case AArch64::CBZX:
Cond[1].setImm(AArch64::CBNZX);
break;
case AArch64::CBNZX:
Cond[1].setImm(AArch64::CBZX);
break;
case AArch64::TBZW:
Cond[1].setImm(AArch64::TBNZW);
break;
case AArch64::TBNZW:
Cond[1].setImm(AArch64::TBZW);
break;
case AArch64::TBZX:
Cond[1].setImm(AArch64::TBNZX);
break;
case AArch64::TBNZX:
Cond[1].setImm(AArch64::TBZX);
break;
}
}
return false;
}
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return 0;
if (!isUncondBranchOpcode(I->getOpcode()) &&
!isCondBranchOpcode(I->getOpcode()))
return 0;
// Remove the branch.
I->eraseFromParent();
I = MBB.end();
if (I == MBB.begin()) {
if (BytesRemoved)
*BytesRemoved = 4;
return 1;
}
--I;
if (!isCondBranchOpcode(I->getOpcode())) {
if (BytesRemoved)
*BytesRemoved = 4;
return 1;
}
// Remove the branch.
I->eraseFromParent();
if (BytesRemoved)
*BytesRemoved = 8;
return 2;
}
void AArch64InstrInfo::instantiateCondBranch(
MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
} else {
// Folded compare-and-branch
// Note that we use addOperand instead of addReg to keep the flags.
const MachineInstrBuilder MIB =
BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
}
}
unsigned AArch64InstrInfo::insertBranch(
MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
if (!FBB) {
if (Cond.empty()) // Unconditional branch?
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
else
instantiateCondBranch(MBB, DL, TBB, Cond);
if (BytesAdded)
*BytesAdded = 4;
return 1;
}
// Two-way conditional branch.
instantiateCondBranch(MBB, DL, TBB, Cond);
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
if (BytesAdded)
*BytesAdded = 8;
return 2;
}
// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
while (Register::isVirtualRegister(VReg)) {
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (!DefMI->isFullCopy())
return VReg;
VReg = DefMI->getOperand(1).getReg();
}
return VReg;
}
// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned *NewVReg = nullptr) {
VReg = removeCopies(MRI, VReg);
if (!Register::isVirtualRegister(VReg))
return 0;
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned Opc = 0;
unsigned SrcOpNum = 0;
switch (DefMI->getOpcode()) {
case AArch64::ADDSXri:
case AArch64::ADDSWri:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to ADDXri and ADDWri.
LLVM_FALLTHROUGH;
case AArch64::ADDXri:
case AArch64::ADDWri:
// add x, 1 -> csinc.
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
DefMI->getOperand(3).getImm() != 0)
return 0;
SrcOpNum = 1;
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
break;
case AArch64::ORNXrr:
case AArch64::ORNWrr: {
// not x -> csinv, represented as orn dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
break;
}
case AArch64::SUBSXrr:
case AArch64::SUBSWrr:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to SUBXrr and SUBWrr.
LLVM_FALLTHROUGH;
case AArch64::SUBXrr:
case AArch64::SUBWrr: {
// neg x -> csneg, represented as sub dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
break;
}
default:
return 0;
}
assert(Opc && SrcOpNum && "Missing parameters");
if (NewVReg)
*NewVReg = DefMI->getOperand(SrcOpNum).getReg();
return Opc;
}
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
Register DstReg, Register TrueReg,
Register FalseReg, int &CondCycles,
int &TrueCycles,
int &FalseCycles) const {
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
if (!RC)
return false;
// Also need to check the dest regclass, in case we're trying to optimize
// something like:
// %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
return false;
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
unsigned ExtraCondLat = Cond.size() != 1;
// GPRs are handled by csel.
// FIXME: Fold in x+1, -x, and ~x when applicable.
if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
// Single-cycle csel, csinc, csinv, and csneg.
CondCycles = 1 + ExtraCondLat;
TrueCycles = FalseCycles = 1;
if (canFoldIntoCSel(MRI, TrueReg))
TrueCycles = 0;
else if (canFoldIntoCSel(MRI, FalseReg))
FalseCycles = 0;
return true;
}
// Scalar floating point is handled by fcsel.
// FIXME: Form fabs, fmin, and fmax when applicable.
if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
AArch64::FPR32RegClass.hasSubClassEq(RC)) {
CondCycles = 5 + ExtraCondLat;
TrueCycles = FalseCycles = 2;
return true;
}
// Can't do vectors.
return false;
}
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
Register TrueReg, Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Parse the condition code, see parseCondBranch() above.
AArch64CC::CondCode CC;
switch (Cond.size()) {
default:
llvm_unreachable("Unknown condition opcode in Cond");
case 1: // b.cc
CC = AArch64CC::CondCode(Cond[0].getImm());
break;
case 3: { // cbz/cbnz
// We must insert a compare against 0.
bool Is64Bit;
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown branch opcode in Cond");
case AArch64::CBZW:
Is64Bit = false;
CC = AArch64CC::EQ;
break;
case AArch64::CBZX:
Is64Bit = true;
CC = AArch64CC::EQ;
break;
case AArch64::CBNZW:
Is64Bit = false;
CC = AArch64CC::NE;
break;
case AArch64::CBNZX:
Is64Bit = true;
CC = AArch64CC::NE;
break;
}
Register SrcReg = Cond[2].getReg();
if (Is64Bit) {
// cmp reg, #0 is actually subs xzr, reg, #0.
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
.addReg(SrcReg)
.addImm(0)
.addImm(0);
} else {
MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
.addReg(SrcReg)
.addImm(0)
.addImm(0);
}
break;
}
case 4: { // tbz/tbnz
// We must insert a tst instruction.
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown branch opcode in Cond");
case AArch64::TBZW:
case AArch64::TBZX:
CC = AArch64CC::EQ;
break;
case AArch64::TBNZW:
case AArch64::TBNZX:
CC = AArch64CC::NE;
break;
}
// tst reg, #(1<<foo) is actually ands xzr, reg, #(1<<foo).
if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
.addReg(Cond[2].getReg())
.addImm(
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
else
BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
.addReg(Cond[2].getReg())
.addImm(
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
break;
}
}
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
bool TryFold = false;
if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
RC = &AArch64::GPR64RegClass;
Opc = AArch64::CSELXr;
TryFold = true;
} else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
RC = &AArch64::GPR32RegClass;
Opc = AArch64::CSELWr;
TryFold = true;
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FCSELDrrr;
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
RC = &AArch64::FPR32RegClass;
Opc = AArch64::FCSELSrrr;
}
assert(RC && "Unsupported regclass");
// Try folding simple instructions into the csel.
if (TryFold) {
unsigned NewVReg = 0;
unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
if (FoldedOpc) {
// The folded opcodes csinc, csinv and csneg apply the operation to
// FalseReg, so we need to invert the condition.
CC = AArch64CC::getInvertedCondCode(CC);
TrueReg = FalseReg;
} else
FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
// Fold the operation. Leave any dead instructions for DCE to clean up.
if (FoldedOpc) {
FalseReg = NewVReg;
Opc = FoldedOpc;
// This extends the live range of NewVReg.
MRI.clearKillFlags(NewVReg);
}
}
// Pull all virtual registers into the appropriate class.
MRI.constrainRegClass(TrueReg, RC);
MRI.constrainRegClass(FalseReg, RC);
// Insert the csel.
BuildMI(MBB, I, DL, get(Opc), DstReg)
.addReg(TrueReg)
.addReg(FalseReg)
.addImm(CC);
}
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
uint64_t Imm = MI.getOperand(1).getImm();
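// The shift pair below clears the upper (64 - BitSize) bits so that only the
// low BitSize bits of the immediate are tested as a logical immediate.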
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI.isAsCheapAsAMove();
const unsigned Opcode = MI.getOpcode();
// Firstly, check cases gated by features.
if (Subtarget.hasZeroCycleZeroingFP()) {
if (Opcode == AArch64::FMOVH0 ||
Opcode == AArch64::FMOVS0 ||
Opcode == AArch64::FMOVD0)
return true;
}
if (Subtarget.hasZeroCycleZeroingGP()) {
if (Opcode == TargetOpcode::COPY &&
(MI.getOperand(1).getReg() == AArch64::WZR ||
MI.getOperand(1).getReg() == AArch64::XZR))
return true;
}
// Secondly, check cases specific to sub-targets.
if (Subtarget.hasExynosCheapAsMoveHandling()) {
if (isExynosCheapAsMove(MI))
return true;
return MI.isAsCheapAsAMove();
}
// Finally, check generic cases.
switch (Opcode) {
default:
return false;
// add/sub on register without shift
case AArch64::ADDWri:
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
return (MI.getOperand(3).getImm() == 0);
// logical ops on immediate
case AArch64::ANDWri:
case AArch64::ANDXri:
case AArch64::EORWri:
case AArch64::EORXri:
case AArch64::ORRWri:
case AArch64::ORRXri:
return true;
// logical ops on register without shift
case AArch64::ANDWrr:
case AArch64::ANDXrr:
case AArch64::BICWrr:
case AArch64::BICXrr:
case AArch64::EONWrr:
case AArch64::EONXrr:
case AArch64::EORWrr:
case AArch64::EORXrr:
case AArch64::ORNWrr:
case AArch64::ORNXrr:
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
// ORRXri, it is as cheap as MOV
case AArch64::MOVi32imm:
return canBeExpandedToORR(MI, 32);
case AArch64::MOVi64imm:
return canBeExpandedToORR(MI, 64);
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::ADDWrs:
case AArch64::ADDXrs:
case AArch64::ADDSWrs:
case AArch64::ADDSXrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
if (ShiftVal == 0)
return true;
return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
}
case AArch64::ADDWrx:
case AArch64::ADDXrx:
case AArch64::ADDXrx64:
case AArch64::ADDSWrx:
case AArch64::ADDSXrx:
case AArch64::ADDSXrx64: {
unsigned Imm = MI.getOperand(3).getImm();
switch (AArch64_AM::getArithExtendType(Imm)) {
default:
return false;
case AArch64_AM::UXTB:
case AArch64_AM::UXTH:
case AArch64_AM::UXTW:
case AArch64_AM::UXTX:
return AArch64_AM::getArithShiftValue(Imm) <= 4;
}
}
case AArch64::SUBWrs:
case AArch64::SUBSWrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
return ShiftVal == 0 ||
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
}
case AArch64::SUBXrs:
case AArch64::SUBSXrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
return ShiftVal == 0 ||
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
}
case AArch64::SUBWrx:
case AArch64::SUBXrx:
case AArch64::SUBXrx64:
case AArch64::SUBSWrx:
case AArch64::SUBSXrx:
case AArch64::SUBSXrx64: {
unsigned Imm = MI.getOperand(3).getImm();
switch (AArch64_AM::getArithExtendType(Imm)) {
default:
return false;
case AArch64_AM::UXTB:
case AArch64_AM::UXTH:
case AArch64_AM::UXTW:
case AArch64_AM::UXTX:
return AArch64_AM::getArithShiftValue(Imm) == 0;
}
}
case AArch64::LDRBBroW:
case AArch64::LDRBBroX:
case AArch64::LDRBroW:
case AArch64::LDRBroX:
case AArch64::LDRDroW:
case AArch64::LDRDroX:
case AArch64::LDRHHroW:
case AArch64::LDRHHroX:
case AArch64::LDRHroW:
case AArch64::LDRHroX:
case AArch64::LDRQroW:
case AArch64::LDRQroX:
case AArch64::LDRSBWroW:
case AArch64::LDRSBWroX:
case AArch64::LDRSBXroW:
case AArch64::LDRSBXroX:
case AArch64::LDRSHWroW:
case AArch64::LDRSHWroX:
case AArch64::LDRSHXroW:
case AArch64::LDRSHXroX:
case AArch64::LDRSWroW:
case AArch64::LDRSWroX:
case AArch64::LDRSroW:
case AArch64::LDRSroX:
case AArch64::LDRWroW:
case AArch64::LDRWroX:
case AArch64::LDRXroW:
case AArch64::LDRXroX:
case AArch64::PRFMroW:
case AArch64::PRFMroX:
case AArch64::STRBBroW:
case AArch64::STRBBroX:
case AArch64::STRBroW:
case AArch64::STRBroX:
case AArch64::STRDroW:
case AArch64::STRDroX:
case AArch64::STRHHroW:
case AArch64::STRHHroX:
case AArch64::STRHroW:
case AArch64::STRHroX:
case AArch64::STRQroW:
case AArch64::STRQroX:
case AArch64::STRSroW:
case AArch64::STRSroX:
case AArch64::STRWroW:
case AArch64::STRWroX:
case AArch64::STRXroW:
case AArch64::STRXroX: {
unsigned IsSigned = MI.getOperand(3).getImm();
return !IsSigned;
}
}
}
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
return false;
case AArch64::SEH_StackAlloc:
case AArch64::SEH_SaveFPLR:
case AArch64::SEH_SaveFPLR_X:
case AArch64::SEH_SaveReg:
case AArch64::SEH_SaveReg_X:
case AArch64::SEH_SaveRegP:
case AArch64::SEH_SaveRegP_X:
case AArch64::SEH_SaveFReg:
case AArch64::SEH_SaveFReg_X:
case AArch64::SEH_SaveFRegP:
case AArch64::SEH_SaveFRegP_X:
case AArch64::SEH_SetFP:
case AArch64::SEH_AddFP:
case AArch64::SEH_Nop:
case AArch64::SEH_PrologEnd:
case AArch64::SEH_EpilogStart:
case AArch64::SEH_EpilogEnd:
return true;
}
}
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::SBFMXri: // aka sxtw
case AArch64::UBFMXri: // aka uxtw
// Check for the 32 -> 64 bit extension case; these instructions can do
// much more.
if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
return false;
// This is a signed or unsigned 32 -> 64 bit extension.
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
SubIdx = AArch64::sub_32;
return true;
}
}
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
const MachineInstr &MIa, const MachineInstr &MIb) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned WidthA = 0, WidthB = 0;
bool OffsetAIsScalable = false, OffsetBIsScalable = false;
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Retrieve the base register, the offset from the base, and the width. The
// width is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).
// If the bases are identical, and the offset of the lower memory access plus
// its width does not overlap the offset of the higher memory access, then the
// memory accesses are disjoint.
// If OffsetAIsScalable and OffsetBIsScalable are both true, they
// are assumed to have the same scale (vscale).
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
WidthA, TRI) &&
getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
WidthB, TRI)) {
if (BaseOpA->isIdenticalTo(*BaseOpB) &&
OffsetAIsScalable == OffsetBIsScalable) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
if (LowOffset + LowWidth <= HighOffset)
return true;
}
}
return false;
}
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
return true;
switch (MI.getOpcode()) {
case AArch64::HINT:
// CSDB hints are scheduling barriers.
if (MI.getOperand(0).getImm() == 0x14)
return true;
break;
case AArch64::DSB:
case AArch64::ISB:
// DSB and ISB also are scheduling barriers.
return true;
default:;
}
return isSEHInstruction(MI);
}
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
// The first operand can be a frame index where we'd normally expect a
// register.
assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
if (!MI.getOperand(1).isReg())
return false;
switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBSWrr:
case AArch64::SUBSWrs:
case AArch64::SUBSWrx:
case AArch64::SUBSXrr:
case AArch64::SUBSXrs:
case AArch64::SUBSXrx:
case AArch64::ADDSWrr:
case AArch64::ADDSWrs:
case AArch64::ADDSWrx:
case AArch64::ADDSXrr:
case AArch64::ADDSXrs:
case AArch64::ADDSXrx:
// Replace SUBSWrr with SUBWrr if NZCV is not used.
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case AArch64::SUBSWri:
case AArch64::ADDSWri:
case AArch64::SUBSXri:
case AArch64::ADDSXri:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: CmpValue is only recorded as 0 (zero immediate) or 1 (non-zero).
CmpValue = MI.getOperand(2).getImm() != 0;
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
// ANDS does not use the same encoding scheme as the other xxxS
// instructions.
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: The return type of decodeLogicalImmediate is uint64_t, while the
// type of CmpValue is int. When converting uint64_t to int, the high 32 bits
// would be lost; in fact this caused a bug in spec2006-483.xalancbmk.
// CmpValue is only used to compare with zero in optimizeCompareInstr.
CmpValue = AArch64_AM::decodeLogicalImmediate(
MI.getOperand(2).getImm(),
MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
return true;
}
return false;
}
static bool UpdateOperandRegClass(MachineInstr &Instr) {
MachineBasicBlock *MBB = Instr.getParent();
assert(MBB && "Can't get MachineBasicBlock here");
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
++OpIdx) {
MachineOperand &MO = Instr.getOperand(OpIdx);
const TargetRegisterClass *OpRegCstraints =
Instr.getRegClassConstraint(OpIdx, TII, TRI);
// If there's no constraint, there's nothing to do.
if (!OpRegCstraints)
continue;
// If the operand is a frame index, there's nothing to do here.
// A frame index operand will resolve correctly during PEI.
if (MO.isFI())
continue;
assert(MO.isReg() &&
"Operand has register constraints without being a register!");
Register Reg = MO.getReg();
if (Register::isPhysicalRegister(Reg)) {
if (!OpRegCstraints->contains(Reg))
return false;
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
!MRI->constrainRegClass(Reg, OpRegCstraints))
return false;
}
return true;
}
/// Return the opcode that does not set flags when possible; otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some of them the
// zero-register encoding becomes the SP register.
bool MIDefinesZeroReg = false;
if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
MIDefinesZeroReg = true;
switch (MI.getOpcode()) {
default:
return MI.getOpcode();
case AArch64::ADDSWrr:
return AArch64::ADDWrr;
case AArch64::ADDSWri:
return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
case AArch64::ADDSWrs:
return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
case AArch64::ADDSWrx:
return AArch64::ADDWrx;
case AArch64::ADDSXrr:
return AArch64::ADDXrr;
case AArch64::ADDSXri:
return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
case AArch64::ADDSXrs:
return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
case AArch64::ADDSXrx:
return AArch64::ADDXrx;
case AArch64::SUBSWrr:
return AArch64::SUBWrr;
case AArch64::SUBSWri:
return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
case AArch64::SUBSWrs:
return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
case AArch64::SUBSWrx:
return AArch64::SUBWrx;
case AArch64::SUBSXrr:
return AArch64::SUBXrr;
case AArch64::SUBSXri:
return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
case AArch64::SUBSXrs:
return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
case AArch64::SUBSXrx:
return AArch64::SUBXrx;
}
}
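// The access kinds form a bitmask, so AK_All checks for both reads and writes
// of the condition flags.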
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are in different blocks, it is assumed that the
/// condition flags are accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
// Early exit if To is at the beginning of the BB.
if (To == To->getParent()->begin())
return true;
// Check whether the instructions are in the same basic block
// If not, assume the condition flags might get modified somewhere.
if (To->getParent() != From->getParent())
return true;
// From must be above To.
assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
[From](MachineInstr &MI) {
return MI.getIterator() == From;
}) != To->getParent()->rend());
// We iterate backward starting at \p To until we hit \p From.
for (const MachineInstr &Instr :
instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
if (((AccessToCheck & AK_Write) &&
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
return true;
}
return false;
}
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be treated as a true
/// compare instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
/// condition code or an instruction which can be converted into such an
/// instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
assert(CmpInstr.getParent());
assert(MRI);
// Replace SUBSWrr with SUBWrr if NZCV is not used.
int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
if (DeadNZCVIdx != -1) {
if (CmpInstr.definesRegister(AArch64::WZR) ||
CmpInstr.definesRegister(AArch64::XZR)) {
CmpInstr.eraseFromParent();
return true;
}
unsigned Opc = CmpInstr.getOpcode();
unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
CmpInstr.setDesc(MCID);
CmpInstr.RemoveOperand(DeadNZCVIdx);
bool succeeded = UpdateOperandRegClass(CmpInstr);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
return true;
}
// Continue only if we have an "ri" form where the immediate is zero.
// FIXME: CmpValue has already been converted to 0 or 1 in the analyzeCompare
// function.
assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (CmpValue != 0 || SrcReg2 != 0)
return false;
// CmpInstr is a compare instruction if its destination register is not used.
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
return substituteCmpToZero(CmpInstr, SrcReg, MRI);
}
/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already an S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
switch (Instr.getOpcode()) {
default:
return AArch64::INSTRUCTION_LIST_END;
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
return Instr.getOpcode();
case AArch64::ADDWrr:
return AArch64::ADDSWrr;
case AArch64::ADDWri:
return AArch64::ADDSWri;
case AArch64::ADDXrr:
return AArch64::ADDSXrr;
case AArch64::ADDXri:
return AArch64::ADDSXri;
case AArch64::ADCWr:
return AArch64::ADCSWr;
case AArch64::ADCXr:
return AArch64::ADCSXr;
case AArch64::SUBWrr:
return AArch64::SUBSWrr;
case AArch64::SUBWri:
return AArch64::SUBSWri;
case AArch64::SUBXrr:
return AArch64::SUBSXrr;
case AArch64::SUBXri:
return AArch64::SUBSXri;
case AArch64::SBCWr:
return AArch64::SBCSWr;
case AArch64::SBCXr:
return AArch64::SBCSXr;
case AArch64::ANDWri:
return AArch64::ANDSWri;
case AArch64::ANDXri:
return AArch64::ANDSXri;
}
}
/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
for (auto *BB : MBB->successors())
if (BB->isLiveIn(AArch64::NZCV))
return true;
return false;
}
namespace {
struct UsedNZCV {
bool N = false;
bool Z = false;
bool C = false;
bool V = false;
UsedNZCV() = default;
UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
this->N |= UsedFlags.N;
this->Z |= UsedFlags.Z;
this->C |= UsedFlags.C;
this->V |= UsedFlags.V;
return *this;
}
};
} // end anonymous namespace
/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
switch (Instr.getOpcode()) {
default:
return AArch64CC::Invalid;
case AArch64::Bcc: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
assert(Idx >= 2);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
}
case AArch64::CSINVWr:
case AArch64::CSINVXr:
case AArch64::CSINCWr:
case AArch64::CSINCXr:
case AArch64::CSELWr:
case AArch64::CSELXr:
case AArch64::CSNEGWr:
case AArch64::CSNEGXr:
case AArch64::FCSELSrrr:
case AArch64::FCSELDrrr: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
assert(Idx >= 1);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
}
}
}
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
assert(CC != AArch64CC::Invalid);
UsedNZCV UsedFlags;
switch (CC) {
default:
break;
case AArch64CC::EQ: // Z set
case AArch64CC::NE: // Z clear
UsedFlags.Z = true;
break;
case AArch64CC::HI: // Z clear and C set
case AArch64CC::LS: // Z set or C clear
UsedFlags.Z = true;
LLVM_FALLTHROUGH;
case AArch64CC::HS: // C set
case AArch64CC::LO: // C clear
UsedFlags.C = true;
break;
case AArch64CC::MI: // N set
case AArch64CC::PL: // N clear
UsedFlags.N = true;
break;
case AArch64CC::VS: // V set
case AArch64CC::VC: // V clear
UsedFlags.V = true;
break;
case AArch64CC::GT: // Z clear, N and V the same
case AArch64CC::LE: // Z set, N and V differ
UsedFlags.Z = true;
LLVM_FALLTHROUGH;
case AArch64CC::GE: // N and V the same
case AArch64CC::LT: // N and V differ
UsedFlags.N = true;
UsedFlags.V = true;
break;
}
return UsedFlags;
}
static bool isADDSRegImm(unsigned Opcode) {
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}
static bool isSUBSRegImm(unsigned Opcode) {
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
/// MI and CmpInstr
/// or if MI opcode is not the S form there must be neither defs of flags
/// nor uses of flags between MI and CmpInstr.
/// - and C/V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
const TargetRegisterInfo *TRI) {
assert(MI);
assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
assert(CmpInstr);
const unsigned CmpOpcode = CmpInstr->getOpcode();
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
return false;
if (MI->getParent() != CmpInstr->getParent())
return false;
if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
return false;
AccessKind AccessToCheck = AK_Write;
if (sForm(*MI) != MI->getOpcode())
AccessToCheck = AK_All;
if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
return false;
UsedNZCV NZCVUsedAfterCmp;
for (const MachineInstr &Instr :
instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
CmpInstr->getParent()->instr_end())) {
if (Instr.readsRegister(AArch64::NZCV, TRI)) {
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
return false;
NZCVUsedAfterCmp |= getUsedNZCV(CC);
}
if (Instr.modifiesRegister(AArch64::NZCV, TRI))
break;
}
return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}
/// Substitute an instruction comparing to zero with another instruction
/// which produces the needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
MachineInstr &CmpInstr, unsigned SrcReg,
const MachineRegisterInfo *MRI) const {
assert(MRI);
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI)
return false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NewOpc = sForm(*MI);
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
return false;
if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
return false;
// Update the instruction to set NZCV.
MI->setDesc(get(NewOpc));
CmpInstr.eraseFromParent();
bool succeeded = UpdateOperandRegClass(*MI);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
MI->addRegisterDefined(AArch64::NZCV, TRI);
return true;
}
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
MI.getOpcode() != AArch64::CATCHRET)
return false;
MachineBasicBlock &MBB = *MI.getParent();
auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
auto TRI = Subtarget.getRegisterInfo();
DebugLoc DL = MI.getDebugLoc();
if (MI.getOpcode() == AArch64::CATCHRET) {
// Skip to the first instruction before the epilog.
const TargetInstrInfo *TII =
MBB.getParent()->getSubtarget().getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
auto MBBI = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::prev(FirstEpilogSEH);
if (FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::next(FirstEpilogSEH);
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
.addReg(AArch64::X0, RegState::Define)
.addMBB(TargetMBB);
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
.addReg(AArch64::X0, RegState::Define)
.addReg(AArch64::X0)
.addMBB(TargetMBB)
.addImm(0);
return true;
}
Register Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
if ((OpFlags & AArch64II::MO_GOT) != 0) {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, OpFlags);
if (Subtarget.isTargetILP32()) {
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
.addDef(Reg32, RegState::Dead)
.addUse(Reg, RegState::Kill)
.addImm(0)
.addMemOperand(*MI.memoperands_begin())
.addDef(Reg, RegState::Implicit);
} else {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
.addMemOperand(*MI.memoperands_begin());
}
} else if (TM.getCodeModel() == CodeModel::Large) {
assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
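// Materialize the absolute 64-bit address of GV with a MOVZ + MOVK (G0..G3)
// sequence, then load the value through it.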
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
.addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
.addImm(16);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
.addImm(32);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G3)
.addImm(48);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Tiny) {
BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
.addGlobalAddress(GV, 0, OpFlags);
} else {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
if (Subtarget.isTargetILP32()) {
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
.addDef(Reg32, RegState::Dead)
.addUse(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
.addMemOperand(*MI.memoperands_begin())
.addDef(Reg, RegState::Implicit);
} else {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
.addMemOperand(*MI.memoperands_begin());
}
}
MBB.erase(MI);
return true;
}
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::MOVZWi:
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
assert(MI.getDesc().getNumOperands() == 3 &&
MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
return true;
}
break;
case AArch64::ANDWri: // and Rd, Rzr, #imm
return MI.getOperand(1).getReg() == AArch64::WZR;
case AArch64::ANDXri:
return MI.getOperand(1).getReg() == AArch64::XZR;
case TargetOpcode::COPY:
return MI.getOperand(1).getReg() == AArch64::WZR;
}
return false;
}
// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// GPR32 copies will be lowered to ORRXrs
Register DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
if (MI.getOperand(1).getReg() == AArch64::XZR) {
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
if (MI.getOperand(2).getImm() == 0) {
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
break;
}
return false;
}
// Return true if this instruction simply renames a floating-point register
// without modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// FPR64 copies will be lowered to ORR.16b
Register DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
case AArch64::ORRv16i8:
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
"invalid ORRv16i8 operands");
return true;
}
break;
}
return false;
}
unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRWui:
case AArch64::LDRXui:
case AArch64::LDRBui:
case AArch64::LDRHui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
default:
break;
case AArch64::STRWui:
case AArch64::STRXui:
case AArch64::STRBui:
case AArch64::STRHui:
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
/// Check all MachineMemOperands for a hint to suppress pairing.
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOSuppressPair;
});
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
if (MI.memoperands_empty())
return;
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
/// Check all MachineMemOperands for a hint that the load/store is strided.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
});
}
bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURBBi:
case AArch64::STURHHi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
case AArch64::LDURHHi:
case AArch64::LDURBBi:
case AArch64::LDURSBWi:
case AArch64::LDURSHWi:
return true;
}
}
Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default: return {};
case AArch64::PRFMui: return AArch64::PRFUMi;
case AArch64::LDRXui: return AArch64::LDURXi;
case AArch64::LDRWui: return AArch64::LDURWi;
case AArch64::LDRBui: return AArch64::LDURBi;
case AArch64::LDRHui: return AArch64::LDURHi;
case AArch64::LDRSui: return AArch64::LDURSi;
case AArch64::LDRDui: return AArch64::LDURDi;
case AArch64::LDRQui: return AArch64::LDURQi;
case AArch64::LDRBBui: return AArch64::LDURBBi;
case AArch64::LDRHHui: return AArch64::LDURHHi;
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
case AArch64::LDRSWui: return AArch64::LDURSWi;
case AArch64::STRXui: return AArch64::STURXi;
case AArch64::STRWui: return AArch64::STURWi;
case AArch64::STRBui: return AArch64::STURBi;
case AArch64::STRHui: return AArch64::STURHi;
case AArch64::STRSui: return AArch64::STURSi;
case AArch64::STRDui: return AArch64::STURDi;
case AArch64::STRQui: return AArch64::STURQi;
case AArch64::STRBBui: return AArch64::STURBBi;
case AArch64::STRHHui: return AArch64::STURHHi;
}
}
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
switch (Opc) {
default:
return 2;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
case AArch64::LDPQi:
case AArch64::STPQi:
case AArch64::LDNPQi:
case AArch64::STNPQi:
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
case AArch64::LD1B_IMM:
case AArch64::LD1H_IMM:
case AArch64::LD1W_IMM:
case AArch64::LD1D_IMM:
case AArch64::ST1B_IMM:
case AArch64::ST1H_IMM:
case AArch64::ST1W_IMM:
case AArch64::ST1D_IMM:
case AArch64::LD1B_H_IMM:
case AArch64::LD1SB_H_IMM:
case AArch64::LD1H_S_IMM:
case AArch64::LD1SH_S_IMM:
case AArch64::LD1W_D_IMM:
case AArch64::LD1SW_D_IMM:
case AArch64::ST1B_H_IMM:
case AArch64::ST1H_S_IMM:
case AArch64::ST1W_D_IMM:
case AArch64::LD1B_S_IMM:
case AArch64::LD1SB_S_IMM:
case AArch64::LD1H_D_IMM:
case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
case AArch64::ST1H_D_IMM:
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
return 2;
}
}
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
// Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRSWui:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
return true;
}
}
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
bool &Is64Bit) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no flag setting equivalent!");
// 32-bit cases:
case AArch64::ADDWri:
Is64Bit = false;
return AArch64::ADDSWri;
case AArch64::ADDWrr:
Is64Bit = false;
return AArch64::ADDSWrr;
case AArch64::ADDWrs:
Is64Bit = false;
return AArch64::ADDSWrs;
case AArch64::ADDWrx:
Is64Bit = false;
return AArch64::ADDSWrx;
case AArch64::ANDWri:
Is64Bit = false;
return AArch64::ANDSWri;
case AArch64::ANDWrr:
Is64Bit = false;
return AArch64::ANDSWrr;
case AArch64::ANDWrs:
Is64Bit = false;
return AArch64::ANDSWrs;
case AArch64::BICWrr:
Is64Bit = false;
return AArch64::BICSWrr;
case AArch64::BICWrs:
Is64Bit = false;
return AArch64::BICSWrs;
case AArch64::SUBWri:
Is64Bit = false;
return AArch64::SUBSWri;
case AArch64::SUBWrr:
Is64Bit = false;
return AArch64::SUBSWrr;
case AArch64::SUBWrs:
Is64Bit = false;
return AArch64::SUBSWrs;
case AArch64::SUBWrx:
Is64Bit = false;
return AArch64::SUBSWrx;
// 64-bit cases:
case AArch64::ADDXri:
Is64Bit = true;
return AArch64::ADDSXri;
case AArch64::ADDXrr:
Is64Bit = true;
return AArch64::ADDSXrr;
case AArch64::ADDXrs:
Is64Bit = true;
return AArch64::ADDSXrs;
case AArch64::ADDXrx:
Is64Bit = true;
return AArch64::ADDSXrx;
case AArch64::ANDXri:
Is64Bit = true;
return AArch64::ANDSXri;
case AArch64::ANDXrr:
Is64Bit = true;
return AArch64::ANDSXrr;
case AArch64::ANDXrs:
Is64Bit = true;
return AArch64::ANDSXrs;
case AArch64::BICXrr:
Is64Bit = true;
return AArch64::BICSXrr;
case AArch64::BICXrs:
Is64Bit = true;
return AArch64::BICSXrs;
case AArch64::SUBXri:
Is64Bit = true;
return AArch64::SUBSXri;
case AArch64::SUBXrr:
Is64Bit = true;
return AArch64::SUBSXrr;
case AArch64::SUBXrs:
Is64Bit = true;
return AArch64::SUBSXrs;
case AArch64::SUBXrx:
Is64Bit = true;
return AArch64::SUBSXrx;
}
}
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// If this is a volatile load/store, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
"Expected a reg or frame index operand.");
if (!MI.getOperand(2).isImm())
return false;
// Can't merge/pair if the instruction modifies the base register.
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
if (MI.getOperand(1).isReg()) {
Register BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
return false;
}
// Check if this load/store has a hint to avoid pair formation.
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
if (isLdStPairSuppressed(MI))
return false;
// Do not pair any callee-save store/reload instructions in the
// prologue/epilogue if the CFI information encoded the operations as separate
// instructions, as that would cause the size of the actual prologue to differ
// from the prologue size recorded in the Windows CFI.
const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
bool NeedsWinCFI = MAI->usesWindowsCFI() &&
MI.getMF()->getFunction().needsUnwindTableEntry();
if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
MI.getFlag(MachineInstr::FrameDestroy)))
return false;
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.isPaired128Slow()) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::LDURQi:
case AArch64::STURQi:
case AArch64::LDRQui:
case AArch64::STRQui:
return false;
}
}
return true;
}
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
const MachineOperand *BaseOp;
if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
Width, TRI))
return false;
BaseOps.push_back(BaseOp);
return true;
}
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
!LdSt.getOperand(2).isImm())
return false;
} else if (LdSt.getNumExplicitOperands() == 4) {
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
if (!LdSt.getOperand(1).isReg() ||
(!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
!LdSt.getOperand(3).isImm())
return false;
} else
return false;
// Get the scaling factor for the instruction and set the width for the
// instruction.
TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
// If this returns false, then it's an instruction we don't want to handle.
if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
return false;
// Compute the offset. Offset is calculated as the immediate operand
// multiplied by the scaling factor. Unscaled instructions have scaling factor
// set to 1.
if (LdSt.getNumExplicitOperands() == 3) {
BaseOp = &LdSt.getOperand(1);
Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
} else {
assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
BaseOp = &LdSt.getOperand(2);
Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
}
OffsetIsScalable = Scale.isScalable();
if (!BaseOp->isReg() && !BaseOp->isFI())
return false;
return true;
}
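// Editorial note (illustrative example, not part of the original change):
// for a scaled, non-paired access such as "ldr x1, [x0, #2]" (LDRXui),
// getMemOpInfo reports a fixed scale of 8, so the function returns BaseOp =
// x0 and Offset = 2 * 8 = 16 bytes with OffsetIsScalable = false. For a
// paired access such as "ldp x1, x2, [x0, #3]" (LDPXi), the base is operand 2
// and Offset = 3 * 8 = 24 bytes.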
MachineOperand &
AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
return OfsOp;
}
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
unsigned &Width, int64_t &MinOffset,
int64_t &MaxOffset) {
const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
switch (Opcode) {
// Not a memory operation or something we want to handle.
default:
Scale = TypeSize::Fixed(0);
Width = 0;
MinOffset = MaxOffset = 0;
return false;
case AArch64::STRWpost:
case AArch64::LDRWpost:
Width = 32;
Scale = TypeSize::Fixed(4);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::PRFUMi:
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
case AArch64::STURDi:
Width = 8;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURWi:
case AArch64::LDURSi:
case AArch64::LDURSWi:
case AArch64::STURWi:
case AArch64::STURSi:
Width = 4;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURHi:
case AArch64::LDURHHi:
case AArch64::LDURSHXi:
case AArch64::LDURSHWi:
case AArch64::STURHi:
case AArch64::STURHHi:
Width = 2;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURBi:
case AArch64::LDURBBi:
case AArch64::LDURSBXi:
case AArch64::LDURSBWi:
case AArch64::STURBi:
case AArch64::STURBBi:
Width = 1;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDPQi:
case AArch64::LDNPQi:
case AArch64::STPQi:
case AArch64::STNPQi:
Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
Scale = TypeSize::Fixed(8);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::PRFMui:
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = TypeSize::Fixed(8);
Width = 8;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
Scale = TypeSize::Fixed(4);
Width = 8;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = TypeSize::Fixed(4);
Width = 4;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::LDRSHWui:
case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = TypeSize::Fixed(2);
Width = 2;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::LDRSBWui:
case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = TypeSize::Fixed(1);
Width = 1;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::ADDG:
Scale = TypeSize::Fixed(16);
Width = 0;
MinOffset = 0;
MaxOffset = 63;
break;
case AArch64::TAGPstack:
Scale = TypeSize::Fixed(16);
Width = 0;
// TAGP with a negative offset turns into SUBP, which has a maximum offset
// of 63 (not 64!).
MinOffset = -63;
MaxOffset = 63;
break;
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::STR_ZZZZXI:
case AArch64::LDR_ZZZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 4;
MinOffset = -256;
MaxOffset = 252;
break;
case AArch64::STR_ZZZXI:
case AArch64::LDR_ZZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 3;
MinOffset = -256;
MaxOffset = 253;
break;
case AArch64::STR_ZZXI:
case AArch64::LDR_ZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 2;
MinOffset = -256;
MaxOffset = 254;
break;
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
Scale = TypeSize::Scalable(2);
Width = SVEMaxBytesPerVector / 8;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LD1B_IMM:
case AArch64::LD1H_IMM:
case AArch64::LD1W_IMM:
case AArch64::LD1D_IMM:
case AArch64::ST1B_IMM:
case AArch64::ST1H_IMM:
case AArch64::ST1W_IMM:
case AArch64::ST1D_IMM:
// A full vector's worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector;
MinOffset = -8;
MaxOffset = 7;
break;
case AArch64::LD1B_H_IMM:
case AArch64::LD1SB_H_IMM:
case AArch64::LD1H_S_IMM:
case AArch64::LD1SH_S_IMM:
case AArch64::LD1W_D_IMM:
case AArch64::LD1SW_D_IMM:
case AArch64::ST1B_H_IMM:
case AArch64::ST1H_S_IMM:
case AArch64::ST1W_D_IMM:
// A half vector's worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(8);
Width = SVEMaxBytesPerVector / 2;
MinOffset = -8;
MaxOffset = 7;
break;
case AArch64::LD1B_S_IMM:
case AArch64::LD1SB_S_IMM:
case AArch64::LD1H_D_IMM:
case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
case AArch64::ST1H_D_IMM:
// A quarter vector's worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(4);
Width = SVEMaxBytesPerVector / 4;
MinOffset = -8;
MaxOffset = 7;
break;
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
// An eighth of a vector's worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(2);
Width = SVEMaxBytesPerVector / 8;
MinOffset = -8;
MaxOffset = 7;
break;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::STGPi:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
}
return true;
}
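// Illustrative reading of the table above (comment added for exposition, not
// from the original change): the scaled LDRXui reports Scale = 8, Width = 8
// and an immediate range of [0, 4095] in scale units, i.e. byte offsets
// 0..32760 in steps of 8, while its unscaled counterpart LDURXi reports
// Scale = 1 and a byte-offset range of [-256, 255].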
// Memory access size in bytes for a (scaled or unscaled) load or store.
int AArch64InstrInfo::getMemScale(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has unknown scale!");
case AArch64::LDRBBui:
case AArch64::LDURBBi:
case AArch64::LDRSBWui:
case AArch64::LDURSBWi:
case AArch64::STRBBui:
case AArch64::STURBBi:
return 1;
case AArch64::LDRHHui:
case AArch64::LDURHHi:
case AArch64::LDRSHWui:
case AArch64::LDURSHWi:
case AArch64::STRHHui:
case AArch64::STURHHi:
return 2;
case AArch64::LDRSui:
case AArch64::LDURSi:
case AArch64::LDRSWui:
case AArch64::LDURSWi:
case AArch64::LDRWui:
case AArch64::LDURWi:
case AArch64::STRSui:
case AArch64::STURSi:
case AArch64::STRWui:
case AArch64::STURWi:
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPWi:
case AArch64::STPSi:
case AArch64::STPWi:
return 4;
case AArch64::LDRDui:
case AArch64::LDURDi:
case AArch64::LDRXui:
case AArch64::LDURXi:
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRXui:
case AArch64::STURXi:
case AArch64::LDPDi:
case AArch64::LDPXi:
case AArch64::STPDi:
case AArch64::STPXi:
return 8;
case AArch64::LDRQui:
case AArch64::LDURQi:
case AArch64::STRQui:
case AArch64::STURQi:
case AArch64::LDPQi:
case AArch64::STPQi:
case AArch64::STGOffset:
case AArch64::STZGOffset:
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
case AArch64::STGPi:
return 16;
}
}
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
int Scale = AArch64InstrInfo::getMemScale(Opc);
// If the byte-offset isn't a multiple of the stride, we can't scale this
// offset.
if (Offset % Scale != 0)
return false;
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
Offset /= Scale;
return true;
}
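// Worked example (added for exposition; assumes an LDURXi-sized access): the
// memory scale for LDURXi is 8, so a byte offset of 16 becomes element offset
// 2, while a byte offset of 12 is rejected because it is not a multiple of 8.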
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
if (FirstOpc == SecondOpc)
return true;
// We can also pair sign-ext and zero-ext instructions.
switch (FirstOpc) {
default:
return false;
case AArch64::LDRWui:
case AArch64::LDURWi:
return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
}
// These instructions can't be paired based on their opcodes.
return false;
}
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
int64_t Offset1, unsigned Opcode1, int FI2,
int64_t Offset2, unsigned Opcode2) {
// Accesses through fixed stack object frame indices may access a different
// fixed stack slot. Check that the object offsets + offsets match.
if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
// Convert to scaled object offsets.
int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
if (ObjectOffset1 % Scale1 != 0)
return false;
ObjectOffset1 /= Scale1;
int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
if (ObjectOffset2 % Scale2 != 0)
return false;
ObjectOffset2 /= Scale2;
ObjectOffset1 += Offset1;
ObjectOffset2 += Offset2;
return ObjectOffset1 + 1 == ObjectOffset2;
}
return FI1 == FI2;
}
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
unsigned NumBytes) const {
assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
const MachineOperand &BaseOp1 = *BaseOps1.front();
const MachineOperand &BaseOp2 = *BaseOps2.front();
const MachineInstr &FirstLdSt = *BaseOp1.getParent();
const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (BaseOp1.getType() != BaseOp2.getType())
return false;
assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
"Only base registers and frame indices are supported.");
// Check for both base regs and base FI.
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
return false;
// Only cluster up to a single pair.
if (NumLoads > 2)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
return false;
// Can we pair these instructions based on their opcodes?
unsigned FirstOpc = FirstLdSt.getOpcode();
unsigned SecondOpc = SecondLdSt.getOpcode();
if (!canPairLdStOpc(FirstOpc, SecondOpc))
return false;
// Can't merge volatiles or load/stores that have a hint to avoid pair
// formation, for example.
if (!isCandidateToMergeOrPair(FirstLdSt) ||
!isCandidateToMergeOrPair(SecondLdSt))
return false;
// isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
// Pairwise instructions have a 7-bit signed offset field.
if (Offset1 > 63 || Offset1 < -64)
return false;
// The caller should already have ordered First/SecondLdSt by offset.
// Note: this need not hold for non-equal frame index bases.
if (BaseOp1.isFI()) {
assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
"Caller should have ordered offsets.");
const MachineFrameInfo &MFI =
FirstLdSt.getParent()->getParent()->getFrameInfo();
return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
BaseOp2.getIndex(), Offset2, SecondOpc);
}
assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
return Offset1 + 1 == Offset2;
}
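// Illustrative case (added for exposition): two LDRXui loads off the same
// base register with immediates 4 and 5 (byte offsets 32 and 40) share an
// opcode, pass the pairing and candidate checks, fall inside the 7-bit signed
// offset range, and satisfy Offset1 + 1 == Offset2, so they are reported as a
// clustering (ldp) candidate.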
static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
unsigned Reg, unsigned SubIdx,
unsigned State,
const TargetRegisterInfo *TRI) {
if (!SubIdx)
return MIB.addReg(Reg, State);
if (Register::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
unsigned NumRegs) {
// We really want the positive remainder mod 32 here, which happens to be
// easily obtainable with a mask.
return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
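// Worked example (added for exposition): with DestReg encoding 0, SrcReg
// encoding 31 and NumRegs == 2, the wrapped distance is (0 - 31) & 0x1f == 1,
// which is < 2, so a forward copy would overwrite the second source
// sub-register before it is read; copyPhysRegTuple therefore copies the
// sub-registers in reverse order in that case.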
void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
unsigned Opcode,
ArrayRef<unsigned> Indices) const {
assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
const TargetRegisterInfo *TRI = &getRegisterInfo();
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
unsigned NumRegs = Indices.size();
int SubReg = 0, End = NumRegs, Incr = 1;
if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
SubReg = NumRegs - 1;
End = -1;
Incr = -1;
}
for (; SubReg != End; SubReg += Incr) {
const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
}
}
void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc,
unsigned Opcode, unsigned ZeroReg,
llvm::ArrayRef<unsigned> Indices) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NumRegs = Indices.size();
#ifndef NDEBUG
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
"GPR reg sequences should not be able to overlap");
#endif
for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
MIB.addReg(ZeroReg);
AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
MIB.addImm(0);
}
}
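// Note added for exposition: each iteration above emits "orr xd, xzr, xm,
// lsl #0" (or the ORRWrs form with wzr), which is simply a register move of
// one sub-register of the sequential pair.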
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
// If either operand is WSP, expand to ADD #0.
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
MCRegister DestRegX = TRI->getMatchingSuperReg(
DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
MCRegister SrcRegX = TRI->getMatchingSuperReg(
SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
.addReg(SrcRegX, RegState::Undef)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
MCRegister DestRegX = TRI->getMatchingSuperReg(
DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
MCRegister SrcRegX = TRI->getMatchingSuperReg(
SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
.addReg(AArch64::XZR)
.addReg(SrcRegX, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
// Otherwise, expand to ORR WZR.
BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
.addReg(AArch64::WZR)
.addReg(SrcReg, getKillRegState(KillSrc));
}
}
return;
}
// Copy a Predicate register by ORRing with itself.
if (AArch64::PPRRegClass.contains(DestReg) &&
AArch64::PPRRegClass.contains(SrcReg)) {
assert(Subtarget.hasSVE() && "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
.addReg(SrcReg) // Pg
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// Copy a Z register by ORRing with itself.
if (AArch64::ZPRRegClass.contains(DestReg) &&
AArch64::ZPRRegClass.contains(SrcReg)) {
assert(Subtarget.hasSVE() && "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
+ // Copy a Z register pair by copying the individual sub-registers.
+ if (AArch64::ZPR2RegClass.contains(DestReg) &&
+ AArch64::ZPR2RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
+
+ // Copy a Z register triple by copying the individual sub-registers.
+ if (AArch64::ZPR3RegClass.contains(DestReg) &&
+ AArch64::ZPR3RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
+
+ // Copy a Z register quad by copying the individual sub-registers.
+ if (AArch64::ZPR4RegClass.contains(DestReg) &&
+ AArch64::ZPR4RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
+
if (AArch64::GPR64spRegClass.contains(DestReg) &&
(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
// If either operand is SP, expand to ADD #0.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// Otherwise, expand to ORR XZR.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
.addReg(AArch64::XZR)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
// Copy a DDDD register quad by copying the individual sub-registers.
if (AArch64::DDDDRegClass.contains(DestReg) &&
AArch64::DDDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2, AArch64::dsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a DDD register triple by copying the individual sub-registers.
if (AArch64::DDDRegClass.contains(DestReg) &&
AArch64::DDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a DD register pair by copying the individual sub-registers.
if (AArch64::DDRegClass.contains(DestReg) &&
AArch64::DDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a QQQQ register quad by copying the individual sub-registers.
if (AArch64::QQQQRegClass.contains(DestReg) &&
AArch64::QQQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
// Copy a QQQ register triple by copying the individual sub-registers.
if (AArch64::QQQRegClass.contains(DestReg) &&
AArch64::QQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
// Copy a QQ register pair by copying the individual sub-registers.
if (AArch64::QQRegClass.contains(DestReg) &&
AArch64::QQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
AArch64::XZR, Indices);
return;
}
if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
AArch64::WZR, Indices);
return;
}
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::STRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(AArch64::SP)
.addImm(-16);
BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(DestReg, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
}
return;
}
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (DestReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MSR))
.addImm(AArch64SysReg::NZCV)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
return;
}
if (SrcReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
.addImm(AArch64SysReg::NZCV)
.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
return;
}
llvm_unreachable("unimplemented reg-to-reg copy");
}
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
Register SrcReg, bool IsKill,
unsigned SubIdx0, unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
Register SrcReg0 = SrcReg;
Register SrcReg1 = SrcReg;
if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
SubIdx1 = 0;
}
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
.addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
}
void AArch64InstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
break;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_PXI;
StackID = TargetStackID::SVEVector;
}
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRWui;
if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
else
assert(SrcReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::STRSui;
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRXui;
if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
else
assert(SrcReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPWi), SrcReg, isKill,
AArch64::sube32, AArch64::subo32, FI, MMO);
return;
}
break;
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::STRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPXi), SrcReg, isKill,
AArch64::sube64, AArch64::subo64, FI, MMO);
return;
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev1d;
Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZZXI;
StackID = TargetStackID::SVEVector;
}
break;
}
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI);
if (Offset)
MI.addImm(0);
MI.addMemOperand(MMO);
}
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
Register DestReg, unsigned SubIdx0,
unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
Register DestReg0 = DestReg;
Register DestReg1 = DestReg;
bool IsUndef = true;
if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
SubIdx0 = 0;
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
SubIdx1 = 0;
IsUndef = false;
}
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
}
void AArch64InstrInfo::loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
break;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_PXI;
StackID = TargetStackID::SVEVector;
}
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRWui;
if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
else
assert(DestReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRSui;
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRXui;
if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
else
assert(DestReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPWi), DestReg, AArch64::sube32,
AArch64::subo32, FI, MMO);
return;
}
break;
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPXi), DestReg, AArch64::sube64,
AArch64::subo64, FI, MMO);
return;
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev1d;
Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZXI;
StackID = TargetStackID::SVEVector;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZZXI;
StackID = TargetStackID::SVEVector;
}
break;
}
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(DestReg, getDefRegState(true))
.addFrameIndex(FI);
if (Offset)
MI.addImm(0);
MI.addMemOperand(MMO);
}
bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
const MachineInstr &UseMI,
const TargetRegisterInfo *TRI) {
return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
UseMI.getIterator()),
[TRI](const MachineInstr &I) {
return I.modifiesRegister(AArch64::NZCV, TRI) ||
I.readsRegister(AArch64::NZCV, TRI);
});
}
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, int64_t Offset, unsigned Opc,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool NeedsWinCFI,
bool *HasWinCFI) {
int Sign = 1;
unsigned MaxEncoding, ShiftSize;
switch (Opc) {
case AArch64::ADDXri:
case AArch64::ADDSXri:
case AArch64::SUBXri:
case AArch64::SUBSXri:
MaxEncoding = 0xfff;
ShiftSize = 12;
break;
case AArch64::ADDVL_XXI:
case AArch64::ADDPL_XXI:
MaxEncoding = 31;
ShiftSize = 0;
if (Offset < 0) {
MaxEncoding = 32;
Sign = -1;
Offset = -Offset;
}
break;
default:
llvm_unreachable("Unsupported opcode");
}
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
// scratch register; otherwise, create a new virtual register (to be
// replaced by the scavenger at the end of PEI). That case can be optimized
// slightly if DestReg is SP which is always 16-byte aligned, so the scratch
// register can be loaded with offset%8 and the add/sub can use an extending
// instruction with LSL#3.
// Currently the function handles any offsets but generates a poor sequence
// of code.
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
Register TmpReg = DestReg;
if (TmpReg == AArch64::XZR)
TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
&AArch64::GPR64RegClass);
do {
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
unsigned LocalShiftSize = 0;
if (ThisVal > MaxEncoding) {
ThisVal = ThisVal >> ShiftSize;
LocalShiftSize = ShiftSize;
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
Offset -= ThisVal << LocalShiftSize;
if (Offset == 0)
TmpReg = DestReg;
auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
.addReg(SrcReg)
.addImm(Sign * (int)ThisVal);
if (ShiftSize)
MBI = MBI.addImm(
AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
MBI = MBI.setMIFlag(Flag);
if (NeedsWinCFI) {
assert(Sign == 1 && "SEH directives should always have a positive sign");
int Imm = (int)(ThisVal << LocalShiftSize);
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
if (HasWinCFI)
*HasWinCFI = true;
if (Imm == 0)
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
else
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
.addImm(Imm)
.setMIFlag(Flag);
assert(Offset == 0 && "Expected remaining offset to be zero to "
"emit a single SEH directive");
} else if (DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(Imm)
.setMIFlag(Flag);
}
if (HasWinCFI)
*HasWinCFI = true;
}
SrcReg = TmpReg;
} while (Offset);
}
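// Worked example (added for exposition, assuming Opc == ADDXri): for
// Offset == 4100, MaxEncoding is 0xfff with ShiftSize 12. The first iteration
// cannot encode 4100 directly, so it emits "add dst, src, #1, lsl #12" and
// subtracts 4096; the second iteration emits "add dst, dst, #4" for the
// remaining 4 bytes.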
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg,
StackOffset Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV,
bool NeedsWinCFI, bool *HasWinCFI) {
int64_t Bytes, NumPredicateVectors, NumDataVectors;
Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
// First emit non-scalable frame offsets, or a simple 'mov'.
if (Bytes || (!Offset && SrcReg != DestReg)) {
assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
"SP increment/decrement not 16-byte aligned");
unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
if (Bytes < 0) {
Bytes = -Bytes;
Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
}
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
NeedsWinCFI, HasWinCFI);
SrcReg = DestReg;
}
assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
"SetNZCV not supported with SVE vectors");
assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
"WinCFI not supported with SVE vectors");
if (NumDataVectors) {
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
SrcReg = DestReg;
}
if (NumPredicateVectors) {
assert(DestReg != AArch64::SP && "Unaligned access to SP");
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
}
}
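// Note added for exposition: the fixed-size byte component is materialised
// first (ADDXri/SUBXri, optionally flag-setting), then any scalable
// data-vector component via ADDVL_XXI, and finally any predicate-vector
// component via ADDPL_XXI, each later step reusing DestReg as the running
// base.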
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS, VirtRegMap *VRM) const {
// This is a bit of a hack. Consider this instruction:
//
// %0 = COPY %sp; GPR64all:%0
//
// We explicitly chose GPR64all for the virtual register so such a copy might
// be eliminated by RegisterCoalescer. However, that may not be possible, and
// %0 may even spill. We can't spill %sp, and since it is in the GPR64all
// register class, TargetInstrInfo::foldMemoryOperand() is going to try.
//
// To prevent that, we are going to constrain the %0 register class here.
//
// <rdar://problem/11522048>
//
if (MI.isFullCopy()) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
return nullptr;
}
if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
}
}
// Handle the case where a copy is being spilled or filled but the source
// and destination register classes don't match. For example:
//
// %0 = COPY %xzr; GPR64common:%0
//
// In this case we can still safely fold away the COPY and generate the
// following spill code:
//
// STRXui %xzr, %stack.0
//
// This also eliminates spilled cross register class COPYs (e.g. between x and
// d regs) of the same size. For example:
//
// %0 = COPY %1; GPR64:%0, FPR64:%1
//
// will be filled as
//
// LDRDui %0, fi<#0>
//
// instead of
//
// LDRXui %Temp, fi<#0>
// %0 = FMOV %Temp
//
if (MI.isCopy() && Ops.size() == 1 &&
// Make sure we're only folding the explicit COPY defs/uses.
(Ops[0] == 0 || Ops[0] == 1)) {
bool IsSpill = Ops[0] == 0;
bool IsFill = !IsSpill;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock &MBB = *MI.getParent();
const MachineOperand &DstMO = MI.getOperand(0);
const MachineOperand &SrcMO = MI.getOperand(1);
Register DstReg = DstMO.getReg();
Register SrcReg = SrcMO.getReg();
// This is slightly expensive to compute for physical regs since
// getMinimalPhysRegClass is slow.
auto getRegClass = [&](unsigned Reg) {
return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
: TRI.getMinimalPhysRegClass(Reg);
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
getRegClass(SrcReg), &TRI);
else
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
getRegClass(DstReg), &TRI);
return &*--InsertPt;
}
// Handle cases like spilling def of:
//
// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
//
// where the physical register source can be widened and stored to the full
// virtual reg destination stack slot, in this case producing:
//
// STRXui %xzr, %stack.0
//
if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
const TargetRegisterClass *SpillRC;
unsigned SpillSubreg;
switch (DstMO.getSubReg()) {
default:
SpillRC = nullptr;
break;
case AArch64::sub_32:
case AArch64::ssub:
if (AArch64::GPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::GPR64RegClass;
SpillSubreg = AArch64::sub_32;
} else if (AArch64::FPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR64RegClass;
SpillSubreg = AArch64::ssub;
} else
SpillRC = nullptr;
break;
case AArch64::dsub:
if (AArch64::FPR64RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR128RegClass;
SpillSubreg = AArch64::dsub;
} else
SpillRC = nullptr;
break;
}
if (SpillRC)
if (unsigned WidenedSrcReg =
TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
FrameIndex, SpillRC, &TRI);
return &*--InsertPt;
}
}
// Handle cases like filling use of:
//
// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
//
// where we can load the full virtual reg source stack slot, into the subreg
// destination, in this case producing:
//
// LDRWui %0:sub_32<def,read-undef>, %stack.0
//
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
const TargetRegisterClass *FillRC;
switch (DstMO.getSubReg()) {
default:
FillRC = nullptr;
break;
case AArch64::sub_32:
FillRC = &AArch64::GPR32RegClass;
break;
case AArch64::ssub:
FillRC = &AArch64::FPR32RegClass;
break;
case AArch64::dsub:
FillRC = &AArch64::FPR64RegClass;
break;
}
if (FillRC) {
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
MachineInstr &LoadMI = *--InsertPt;
MachineOperand &LoadDst = LoadMI.getOperand(0);
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
LoadDst.setSubReg(DstMO.getSubReg());
LoadDst.setIsUndef();
return &LoadMI;
}
}
}
// Cannot fold.
return nullptr;
}
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
StackOffset &SOffset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
int64_t *EmittableOffset) {
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = false;
if (OutUnscaledOp)
*OutUnscaledOp = 0;
// Exit early for structured vector spills/fills as they can't take an
// immediate offset.
switch (MI.getOpcode()) {
default:
break;
case AArch64::LD1Twov2d:
case AArch64::LD1Threev2d:
case AArch64::LD1Fourv2d:
case AArch64::LD1Twov1d:
case AArch64::LD1Threev1d:
case AArch64::LD1Fourv1d:
case AArch64::ST1Twov2d:
case AArch64::ST1Threev2d:
case AArch64::ST1Fourv2d:
case AArch64::ST1Twov1d:
case AArch64::ST1Threev1d:
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
case AArch64::STGloop:
case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}
// Get the min/max offset and the scale.
TypeSize ScaleValue(0U, false);
unsigned Width;
int64_t MinOff, MaxOff;
if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
bool IsMulVL = ScaleValue.isScalable();
unsigned Scale = ScaleValue.getKnownMinSize();
int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
Offset += ImmOpnd.getImm() * Scale;
// If the offset doesn't match the scale, rewrite the instruction to use the
// unscaled instruction instead. Do the same if the offset is negative and an
// unscaled op is available.
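// For illustration: LDRXui scales its immediate by 8, so a byte offset of 20
// cannot be encoded directly, and the unscaled LDURXi form, which takes the
// raw byte offset, is used instead when available.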
Optional<unsigned> UnscaledOp =
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
if (useUnscaledOp &&
!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
Scale = ScaleValue.getKnownMinSize();
assert(IsMulVL == ScaleValue.isScalable() &&
"Unscaled opcode has different value for scalable");
int64_t Remainder = Offset % Scale;
assert(!(Remainder && useUnscaledOp) &&
"Cannot have remainder when using unscaled op");
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
int64_t NewOffset = Offset / Scale;
if (MinOff <= NewOffset && NewOffset <= MaxOff)
Offset = Remainder;
else {
NewOffset = NewOffset < 0 ? MinOff : MaxOff;
Offset = Offset - NewOffset * Scale + Remainder;
}
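// Illustrative example: with Scale = 8 and an immediate range of [0, 4095], a
// byte offset of 40000 clamps NewOffset to 4095; the remaining
// 40000 - 4095 * 8 = 7240 bytes stay in SOffset for the caller to materialize
// separately.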
if (EmittableOffset)
*EmittableOffset = NewOffset;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = useUnscaledOp;
if (OutUnscaledOp && UnscaledOp)
*OutUnscaledOp = *UnscaledOp;
if (IsMulVL)
SOffset = StackOffset(Offset, MVT::nxv1i8) +
StackOffset(SOffset.getBytes(), MVT::i8);
else
SOffset = StackOffset(Offset, MVT::i8) +
StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
return AArch64FrameOffsetCanUpdate |
(SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII) {
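// The immediate offset operand is expected to immediately follow the
// frame-index operand (ImmIdx = FrameRegIdx + 1); this holds for the ADD
// forms and the load/store addressing forms rewritten here.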
unsigned Opcode = MI.getOpcode();
unsigned ImmIdx = FrameRegIdx + 1;
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
MI.eraseFromParent();
Offset = StackOffset();
return true;
}
int64_t NewOffset;
unsigned UnscaledOp;
bool UseUnscaledOp;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
&UnscaledOp, &NewOffset);
if (Status & AArch64FrameOffsetCanUpdate) {
if (Status & AArch64FrameOffsetIsLegal)
// Replace the FrameIndex with FrameReg.
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (UseUnscaledOp)
MI.setDesc(TII->get(UnscaledOp));
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
return !Offset;
}
return false;
}
void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
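// NOP is encoded as HINT #0 on AArch64.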
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
// AArch64 supports MachineCombiner.
bool AArch64InstrInfo::useMachineCombiner() const { return true; }
// True when Opc sets the NZCV flags
static bool isCombineInstrSettingFlag(unsigned Opc) {
switch (Opc) {
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBSWri:
case AArch64::SUBSXri:
return true;
default:
break;
}
return false;
}
// 32b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate32(unsigned Opc) {
switch (Opc) {
case AArch64::ADDWrr:
case AArch64::ADDWri:
case AArch64::SUBWrr:
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::SUBSWrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBWri:
case AArch64::SUBSWri:
return true;
default:
break;
}
return false;
}
// 64b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate64(unsigned Opc) {
switch (Opc) {
case AArch64::ADDXrr:
case AArch64::ADDXri:
case AArch64::SUBXrr:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBXri:
case AArch64::SUBSXri:
case AArch64::ADDv8i8:
case AArch64::ADDv16i8:
case AArch64::ADDv4i16:
case AArch64::ADDv8i16:
case AArch64::ADDv2i32:
case AArch64::ADDv4i32:
case AArch64::SUBv8i8:
case AArch64::SUBv16i8:
case AArch64::SUBv4i16:
case AArch64::SUBv8i16:
case AArch64::SUBv2i32:
case AArch64::SUBv4i32:
return true;
default:
break;
}
return false;
}
// FP Opcodes that can be combined with a FMUL
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
default:
break;
case AArch64::FADDHrr:
case AArch64::FADDSrr:
case AArch64::FADDDrr:
case AArch64::FADDv4f16:
case AArch64::FADDv8f16:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FSUBHrr:
case AArch64::FSUBSrr:
case AArch64::FSUBDrr:
case AArch64::FSUBv4f16:
case AArch64::FSUBv8f16:
case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
return (Options.UnsafeFPMath ||
Options.AllowFPOpFusion == FPOpFusion::Fast);
}
return false;
}
// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}
//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned CombineOpc, unsigned ZeroReg = 0,
bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
// Must only be used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
if (CheckZeroReg) {
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
// The third input reg must be zero.
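// (On AArch64 a plain MUL is an alias of MADD with the zero register as the
// addend, so this check ensures the matched MADD really is a multiply.)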
if (MI->getOperand(3).getReg() != ZeroReg)
return false;
}
return true;
}
//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc, unsigned ZeroReg) {
return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}
//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc) {
return canCombine(MBB, MO, MulOpc);
}
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(
const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case AArch64::FADDDrr:
case AArch64::FADDSrr:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FMULDrr:
case AArch64::FMULSrr:
case AArch64::FMULX32:
case AArch64::FMULX64:
case AArch64::FMULXv2f32:
case AArch64::FMULXv2f64:
case AArch64::FMULXv4f32:
case AArch64::FMULv2f32:
case AArch64::FMULv2f64:
case AArch64::FMULv4f32:
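// Reassociating FP adds/muls is only safe under unsafe-fp-math, since FP
// arithmetic is not associative in general.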
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
default:
return false;
}
}
/// Find instructions that can be turned into madd.
static bool getMaddPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
unsigned Opc = Root.getOpcode();
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
if (!isCombineInstrCandidate(Opc))
return false;
if (isCombineInstrSettingFlag(Opc)) {
int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
// When NZCV is live, bail out.
if (Cmp_NZCV == -1)
return false;
unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When the opcode can't change, bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
return false;
Opc = NewOpc;
}
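// E.g. an ADDSWrr whose NZCV def is dead is handled as ADDWrr from here on,
// so it can still participate in the MADD patterns below.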
auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
MachineCombinerPattern Pattern) {
if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
Patterns.push_back(Pattern);
Found = true;
}
};
auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
Patterns.push_back(Pattern);
Found = true;
}
};
typedef MachineCombinerPattern MCP;
switch (Opc) {
default:
break;
case AArch64::ADDWrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"ADDWrr does not have register operands");
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
break;
case AArch64::ADDXrr:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
break;
case AArch64::SUBWrr:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
break;
case AArch64::SUBXrr:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
break;
case AArch64::ADDWri:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
break;
case AArch64::ADDXri:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
break;
case AArch64::SUBWri:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
break;
case AArch64::SUBXri:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
break;
case AArch64::ADDv8i8:
setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
break;
case AArch64::ADDv16i8:
setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
break;
case AArch64::ADDv4i16:
setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
break;
case AArch64::ADDv8i16:
setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
break;
case AArch64::ADDv2i32:
setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
break;
case AArch64::ADDv4i32:
setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
break;
case AArch64::SUBv8i8:
setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
break;
case AArch64::SUBv16i8:
setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
break;
case AArch64::SUBv4i16:
setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
break;
case AArch64::SUBv8i16:
setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
break;
case AArch64::SUBv2i32:
setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
break;
case AArch64::SUBv4i32:
setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
break;
}
return Found;
}
/// Floating-Point Support
/// Find instructions that can be turned into a fused FP multiply-add/subtract.
static bool getFMAPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
if (!isCombineInstrCandidateFP(Root))
return false;
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
auto Match = [&](int Opcode, int Operand,
MachineCombinerPattern Pattern) -> bool {
if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
Patterns.push_back(Pattern);
return true;
}
return false;
};
typedef MachineCombinerPattern MCP;
switch (Root.getOpcode()) {
default:
assert(false && "Unsupported FP instruction in combiner\n");
break;
case AArch64::FADDHrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDHrr does not have register operands");
Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
break;
case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDSrr does not have register operands");
Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
break;
case AArch64::FADDDrr:
Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
break;
case AArch64::FADDv4f16:
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
break;
case AArch64::FADDv8f16:
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
break;
case AArch64::FADDv2f32:
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
break;
case AArch64::FADDv2f64:
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
break;
case AArch64::FADDv4f32:
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
break;
case AArch64::FSUBHrr:
Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
break;
case AArch64::FSUBSrr:
Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
break;
case AArch64::FSUBDrr:
Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
break;
case AArch64::FSUBv4f16:
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
break;
case AArch64::FSUBv8f16:
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
break;
case AArch64::FSUBv2f32:
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
break;
case AArch64::FSUBv2f64:
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
break;
case AArch64::FSUBv4f32:
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
break;
}
return Found;
}
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
bool AArch64InstrInfo::isThroughputPattern(
MachineCombinerPattern Pattern) const {
switch (Pattern) {
default:
break;
case MachineCombinerPattern::FMULADDH_OP1:
case MachineCombinerPattern::FMULADDH_OP2:
case MachineCombinerPattern::FMULSUBH_OP1:
case MachineCombinerPattern::FMULSUBH_OP2:
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULADDD_OP1:
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
case MachineCombinerPattern::FNMULSUBH_OP1:
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f16_OP2:
case MachineCombinerPattern::FMLAv4f16_OP1:
case MachineCombinerPattern::FMLAv8f16_OP1:
case MachineCombinerPattern::FMLAv8f16_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
case MachineCombinerPattern::FMLAv2f64_OP2:
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP1:
case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
case MachineCombinerPattern::FMLSv4f16_OP1:
case MachineCombinerPattern::FMLSv4f16_OP2:
case MachineCombinerPattern::FMLSv8f16_OP1:
case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
case MachineCombinerPattern::MULADDv8i8_OP1:
case MachineCombinerPattern::MULADDv8i8_OP2:
case MachineCombinerPattern::MULADDv16i8_OP1:
case MachineCombinerPattern::MULADDv16i8_OP2:
case MachineCombinerPattern::MULADDv4i16_OP1:
case MachineCombinerPattern::MULADDv4i16_OP2:
case MachineCombinerPattern::MULADDv8i16_OP1:
case MachineCombinerPattern::MULADDv8i16_OP2:
case MachineCombinerPattern::MULADDv2i32_OP1:
case MachineCombinerPattern::MULADDv2i32_OP2:
case MachineCombinerPattern::MULADDv4i32_OP1:
case MachineCombinerPattern::MULADDv4i32_OP2:
case MachineCombinerPattern::MULSUBv8i8_OP1:
case MachineCombinerPattern::MULSUBv8i8_OP2:
case MachineCombinerPattern::MULSUBv16i8_OP1:
case MachineCombinerPattern::MULSUBv16i8_OP2:
case MachineCombinerPattern::MULSUBv4i16_OP1:
case MachineCombinerPattern::MULSUBv4i16_OP2:
case MachineCombinerPattern::MULSUBv8i16_OP1:
case MachineCombinerPattern::MULSUBv8i16_OP2:
case MachineCombinerPattern::MULSUBv2i32_OP1:
case MachineCombinerPattern::MULSUBv2i32_OP2:
case MachineCombinerPattern::MULSUBv4i32_OP1:
case MachineCombinerPattern::MULSUBv4i32_OP2:
case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
return true;
} // end switch (Pattern)
return false;
}
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since the
/// pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
// Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
// Floating point patterns
if (getFMAPatterns(Root, Patterns))
return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
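///
/// Note on operand order: with FMAInstKind::Default the operands are emitted
/// as (mulLHS, mulRHS, addend), matching FMADD; with Accumulator and Indexed
/// the addend comes first, matching the accumulator-first FMLA forms, and the
/// Indexed case additionally copies the lane immediate from the F|MUL.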
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
unsigned MaddOpc, const TargetRegisterClass *RC,
FMAInstKind kind = FMAInstKind::Default,
const Register *ReplacedAddend = nullptr) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
Register ResultReg = Root.getOperand(0).getReg();
Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
unsigned SrcReg2;
bool Src2IsKill;
if (ReplacedAddend) {
// If we just generated a new addend, we must be its only use.
SrcReg2 = *ReplacedAddend;
Src2IsKill = true;
} else {
SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
}
if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (Register::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
MachineInstrBuilder MIB;
if (kind == FMAInstKind::Default)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(SrcReg2, getKillRegState(Src2IsKill));
else if (kind == FMAInstKind::Indexed)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addImm(MUL->getOperand(3).getImm());
else if (kind == FMAInstKind::Accumulator)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill));
else
assert(false && "Invalid FMA instruction kind \n");
// Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
InsInstrs.push_back(MIB);
return MUL;
}
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Accumulator);
}
/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
unsigned MnegOpc, const TargetRegisterClass *RC) {
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB);
assert(InstrIdxForVirtReg.empty());
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
return NewVR;
}
/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
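/// (e.g. SUB R, (MUL A, B), C becomes NewVR = NEG C; R = MLA NewVR, A, B,
/// i.e. R = A*B - C).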
static MachineInstr *genFusedMultiplyAccNeg(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1);
Register NewVR =
genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Accumulator, &NewVR);
}
/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Indexed);
}
/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1);
Register NewVR =
genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Indexed, &NewVR);
}
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
/// MUL I=A,B,0
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
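///
/// For illustration (hypothetical input): MUL w1, w2, w3; ADD w0, w1, #0xff
/// becomes ORR wV, wzr, #0xff; MADD w0, w2, w3, wV, provided the immediate is
/// encodable as a logical immediate (checked by the caller).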
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
Register ResultReg = Root.getOperand(0).getReg();
Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (Register::isVirtualRegister(VR))
MRI.constrainRegClass(VR, RC);
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(VR);
// Insert the MADD
InsInstrs.push_back(MIB);
return MUL;
}
/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence.
void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
MachineBasicBlock &MBB = *Root.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
MachineInstr *MUL;
const TargetRegisterClass *RC;
unsigned Opc;
switch (Pattern) {
default:
// Reassociate instructions.
TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
DelInstrs, InstrIdxForVirtReg);
return;
case MachineCombinerPattern::MULADDW_OP1:
case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
// MUL I=A,B,0
// ADD R,C,I
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
// MUL I=A,B,0
// ADD R,I,Imm
// ==> ORR V, ZR, Imm
// ==> MADD R,A,B,V
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
}
uint64_t UImm = SignExtend64(Imm, BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
.addReg(ZeroReg)
.addImm(Encoding);
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
}
break;
}
case MachineCombinerPattern::MULSUBW_OP1:
case MachineCombinerPattern::MULSUBX_OP1: {
// MUL I=A,B,0
// SUB R,I, C
// ==> SUB V, 0, C
// ==> MADD R,A,B,V // = -C + A*B
// --- Create(MADD);
const TargetRegisterClass *SubRC;
unsigned SubOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
SubOpc = AArch64::SUBWrr;
SubRC = &AArch64::GPR32spRegClass;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
SubOpc = AArch64::SUBXrr;
SubRC = &AArch64::GPR64spRegClass;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
Register NewVR = MRI.createVirtualRegister(SubRC);
// SUB NewVR, 0, C
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
.addReg(ZeroReg)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
case MachineCombinerPattern::MULSUBW_OP2:
case MachineCombinerPattern::MULSUBX_OP2:
// MUL I=A,B,0
// SUB R,C,I
// ==> MSUB R,A,B,C (computes C - A*B)
// --- Create(MSUB);
if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
Opc = AArch64::MSUBWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
// SUB R,I, Imm
// ==> ORR V, ZR, -Imm
// ==> MADD R,A,B,V // = -Imm + A*B
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
}
uint64_t UImm = SignExtend64(-Imm, BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
.addReg(ZeroReg)
.addImm(Encoding);
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
}
break;
}
case MachineCombinerPattern::MULADDv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv8i8_OP2:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv16i8_OP1:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv16i8_OP2:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i16_OP1:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i16_OP2:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv8i16_OP1:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv8i16_OP2:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv2i32_OP1:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv2i32_OP2:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i32_OP1:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i32_OP2:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
RC);
break;
case MachineCombinerPattern::MULSUBv8i8_OP2:
Opc = AArch64::MLSv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv16i8_OP1:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
RC);
break;
case MachineCombinerPattern::MULSUBv16i8_OP2:
Opc = AArch64::MLSv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv4i16_OP1:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
RC);
break;
case MachineCombinerPattern::MULSUBv4i16_OP2:
Opc = AArch64::MLSv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv8i16_OP1:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
RC);
break;
case MachineCombinerPattern::MULSUBv8i16_OP2:
Opc = AArch64::MLSv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv2i32_OP1:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
RC);
break;
case MachineCombinerPattern::MULSUBv2i32_OP2:
Opc = AArch64::MLSv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv4i32_OP1:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
RC);
break;
case MachineCombinerPattern::MULSUBv4i32_OP2:
Opc = AArch64::MLSv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
RC);
break;
case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
Opc = AArch64::MLSv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
RC);
break;
case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
Opc = AArch64::MLSv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
RC);
break;
case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
Opc = AArch64::MLSv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
RC);
break;
case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
Opc = AArch64::MLSv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
// Floating Point Support
case MachineCombinerPattern::FMULADDH_OP1:
Opc = AArch64::FMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULADDS_OP1:
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULADDD_OP1:
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULADDH_OP2:
Opc = AArch64::FMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMULADDS_OP2:
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMULADDD_OP2:
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv4f16_OP1:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv4f16_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv8f16_OP1:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv8f16_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv2f64_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4f32_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMULSUBH_OP1:
Opc = AArch64::FNMSUBHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULSUBS_OP1:
Opc = AArch64::FNMSUBSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULSUBD_OP1:
Opc = AArch64::FNMSUBDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FNMULSUBH_OP1:
Opc = AArch64::FNMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FNMULSUBS_OP1:
Opc = AArch64::FNMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FNMULSUBD_OP1:
Opc = AArch64::FNMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULSUBH_OP2:
Opc = AArch64::FMSUBHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMULSUBS_OP2:
Opc = AArch64::FMSUBSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMULSUBD_OP2:
Opc = AArch64::FMSUBDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
Opc = AArch64::FMLSv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv4f16_OP1:
case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
} else {
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv4f16_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLSv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLSv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
Opc = AArch64::FMLSv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLSv8f16_OP1:
case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
} else {
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv8f16_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
break;
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
Opc = AArch64::FMLSv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLSv4f32_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
Opc = AArch64::FMLSv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLSv2f32_OP1:
case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv4f32_OP1:
case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv2f64_OP1:
case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
}
/// Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbnz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<inverted condition code>
/// \endcode
///
/// 2. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<condition code>
/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
/// \code
/// and w8, w8, #0x400
/// cbnz w8, L1
/// \endcode
/// to
/// \code
/// tbnz w8, #10, L1
/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
bool IsNegativeBranch = false;
bool IsTestAndBranch = false;
unsigned TargetBBInMI = 0;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
return false;
case AArch64::CBZW:
case AArch64::CBZX:
TargetBBInMI = 1;
break;
case AArch64::CBNZW:
case AArch64::CBNZX:
TargetBBInMI = 1;
IsNegativeBranch = true;
break;
case AArch64::TBZW:
case AArch64::TBZX:
TargetBBInMI = 2;
IsTestAndBranch = true;
break;
case AArch64::TBNZW:
case AArch64::TBNZX:
TargetBBInMI = 2;
IsNegativeBranch = true;
IsTestAndBranch = true;
break;
}
// So we increment a zero register and test for bits other
// than bit 0? Conservatively bail out in case the verifier
// missed this case.
if (IsTestAndBranch && MI.getOperand(1).getImm())
return false;
// Find Definition.
assert(MI.getParent() && "Incomplete machine instruciton\n");
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
Register VReg = MI.getOperand(0).getReg();
if (!Register::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
// Look through COPY instructions to find definition.
while (DefMI->isCopy()) {
Register CopyVReg = DefMI->getOperand(1).getReg();
if (!MRI->hasOneNonDBGUse(CopyVReg))
return false;
if (!MRI->hasOneDef(CopyVReg))
return false;
DefMI = MRI->getVRegDef(CopyVReg);
}
switch (DefMI->getOpcode()) {
default:
return false;
// Fold AND into a TBZ/TBNZ if constant operand is power of 2.
case AArch64::ANDWri:
case AArch64::ANDXri: {
if (IsTestAndBranch)
return false;
if (DefMI->getParent() != MBB)
return false;
if (!MRI->hasOneNonDBGUse(VReg))
return false;
bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
if (!isPowerOf2_64(Mask))
return false;
MachineOperand &MO = DefMI->getOperand(1);
Register NewReg = MO.getReg();
if (!Register::isVirtualRegister(NewReg))
return false;
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
DebugLoc DL = MI.getDebugLoc();
unsigned Imm = Log2_64(Mask);
unsigned Opc = (Imm < 32)
? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
: (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
.addReg(NewReg)
.addImm(Imm)
.addMBB(TBB);
// Register lives on to the new TBZ/TBNZ now.
MO.setIsKill(false);
// For immediates smaller than 32, we need to use the 32-bit
// variant (W) in all cases, because the 64-bit variant cannot
// encode them.
// Therefore, if the input register is 64-bit, we need to take its
// 32-bit sub-register.
if (!Is32Bit && Imm < 32)
NewMI->getOperand(0).setSubReg(AArch64::sub_32);
MI.eraseFromParent();
return true;
}
// Look for CSINC
case AArch64::CSINCWr:
case AArch64::CSINCXr: {
if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
DefMI->getOperand(2).getReg() == AArch64::WZR) &&
!(DefMI->getOperand(1).getReg() == AArch64::XZR &&
DefMI->getOperand(2).getReg() == AArch64::XZR))
return false;
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
return false;
AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
// Convert only when the condition code is not modified between
// the CSINC and the branch. The CC may be used by other
// instructions in between.
if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
return false;
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
DebugLoc DL = MI.getDebugLoc();
if (IsNegativeBranch)
CC = AArch64CC::getInvertedCondCode(CC);
BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
MI.eraseFromParent();
return true;
}
}
}
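// Split a machine operand's target flags into the exclusive MO_FRAGMENT part
// (direct flags) and the remaining ORable bitmask flags.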
std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
const unsigned Mask = AArch64II::MO_FRAGMENT;
return std::make_pair(TF & Mask, TF & ~Mask);
}
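// Human-readable names used when (de)serializing these target-specific
// operand and memory-operand flags, e.g. in MIR.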
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace AArch64II;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
{MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
{MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
{MO_HI12, "aarch64-hi12"}};
return makeArrayRef(TargetFlags);
}
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
using namespace AArch64II;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_COFFSTUB, "aarch64-coffstub"},
{MO_GOT, "aarch64-got"},
{MO_NC, "aarch64-nc"},
{MO_S, "aarch64-s"},
{MO_TLS, "aarch64-tls"},
{MO_DLLIMPORT, "aarch64-dllimport"},
{MO_PREL, "aarch64-prel"},
{MO_TAGGED, "aarch64-tagged"}};
return makeArrayRef(TargetFlags);
}
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
{{MOSuppressPair, "aarch64-suppress-pair"},
{MOStridedAccess, "aarch64-strided-access"}};
return makeArrayRef(TargetFlags);
}
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1 Save LR OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 Restore LR I2
/// I3
/// RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION I1
/// RET I2
/// RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 I2
/// I3
/// RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// BL f I2
/// B f
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1 Save LR OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 Restore LR I2
/// I3
/// RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
MachineOutlinerTailCall, /// Only emit a branch.
MachineOutlinerNoLRSave, /// Emit a call and return.
MachineOutlinerThunk, /// Emit a call and tail-call.
MachineOutlinerRegSave /// Same as default, but save to a register.
};
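// Flags describing properties of a candidate basic block, set by
// isMBBSafeToOutlineFrom() and later read back via outliner::Candidate::Flags.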
enum MachineOutlinerMBBFlags {
LRUnavailableSomewhere = 0x2,
HasCalls = 0x4,
UnsafeRegsDead = 0x8
};
unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
assert(C.LRUWasSet && "LRU wasn't set?");
MachineFunction *MF = C.getMF();
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
if (!ARI->isReservedReg(*MF, Reg) &&
Reg != AArch64::LR && // LR is not reserved, but don't use it.
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
Reg != AArch64::X17 && // Ditto for X17.
C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
return Reg;
}
// No suitable register. Return 0.
return 0u;
}
static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const Function &Fa = a.getMF()->getFunction();
const Function &Fb = b.getMF()->getFunction();
// If none of the functions have the "sign-return-address" attribute their
// signing behaviour is equal
if (!Fa.hasFnAttribute("sign-return-address") &&
!Fb.hasFnAttribute("sign-return-address")) {
return true;
}
// If both functions have the "sign-return-address" attribute their signing
// behaviour is equal, if the values of the attributes are equal
if (Fa.hasFnAttribute("sign-return-address") &&
Fb.hasFnAttribute("sign-return-address")) {
StringRef ScopeA =
Fa.getFnAttribute("sign-return-address").getValueAsString();
StringRef ScopeB =
Fb.getFnAttribute("sign-return-address").getValueAsString();
return ScopeA.equals(ScopeB);
}
// If function B doesn't have the "sign-return-address" attribute but A does,
// the functions' signing behaviour is equal if A's value for
// "sign-return-address" is "none" and vice versa.
if (Fa.hasFnAttribute("sign-return-address")) {
StringRef ScopeA =
Fa.getFnAttribute("sign-return-address").getValueAsString();
return ScopeA.equals("none");
}
if (Fb.hasFnAttribute("sign-return-address")) {
StringRef ScopeB =
Fb.getFnAttribute("sign-return-address").getValueAsString();
return ScopeB.equals("none");
}
llvm_unreachable("Unkown combination of sign-return-address attributes");
}
static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const Function &Fa = a.getMF()->getFunction();
const Function &Fb = b.getMF()->getFunction();
// If none of the functions have the "sign-return-address-key" attribute
// their keys are equal
if (!Fa.hasFnAttribute("sign-return-address-key") &&
!Fb.hasFnAttribute("sign-return-address-key")) {
return true;
}
// If both functions have the "sign-return-address-key" attribute their
// keys are equal if the values of "sign-return-address-key" are equal
if (Fa.hasFnAttribute("sign-return-address-key") &&
Fb.hasFnAttribute("sign-return-address-key")) {
StringRef KeyA =
Fa.getFnAttribute("sign-return-address-key").getValueAsString();
StringRef KeyB =
Fb.getFnAttribute("sign-return-address-key").getValueAsString();
return KeyA.equals(KeyB);
}
// If B doesn't have the "sign-return-address-key" attribute, both keys are
// equal, if function a has the default key (a_key)
if (Fa.hasFnAttribute("sign-return-address-key")) {
StringRef KeyA =
Fa.getFnAttribute("sign-return-address-key").getValueAsString();
return KeyA.equals_lower("a_key");
}
if (Fb.hasFnAttribute("sign-return-address-key")) {
StringRef KeyB =
Fb.getFnAttribute("sign-return-address-key").getValueAsString();
return KeyB.equals_lower("a_key");
}
llvm_unreachable("Unkown combination of sign-return-address-key attributes");
}
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const AArch64Subtarget &SubtargetA =
a.getMF()->getSubtarget<AArch64Subtarget>();
const AArch64Subtarget &SubtargetB =
b.getMF()->getSubtarget<AArch64Subtarget>();
return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}
outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
unsigned SequenceSize =
std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
[this](unsigned Sum, const MachineInstr &MI) {
return Sum + getInstSizeInBytes(MI);
});
unsigned NumBytesToCreateFrame = 0;
// We only allow outlining for functions having exactly matching return
// address signing attributes, i.e., all share the same value for the
// attribute "sign-return-address" and all share the same type of key they
// are signed with.
// Additionally, we require that either all functions support v8.3a features
// or none of them do. Otherwise an outlined function could get signed
// using dedicated v8.3 instructions and a call from a function that doesn't
// support v8.3 instructions would therefore be invalid.
if (std::adjacent_find(
RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[](const outliner::Candidate &a, const outliner::Candidate &b) {
// Return true if a and b are non-equal w.r.t. return address
// signing or support of v8.3a features
if (outliningCandidatesSigningScopeConsensus(a, b) &&
outliningCandidatesSigningKeyConsensus(a, b) &&
outliningCandidatesV8_3OpsConsensus(a, b)) {
return false;
}
return true;
}) != RepeatedSequenceLocs.end()) {
return outliner::OutlinedFunction();
}
// Since at this point all candidates agree on their return address signing,
// picking just one is fine. If the candidate functions potentially sign their
// return addresses, the outlined function should do the same. Note that in
// the case of "sign-return-address"="non-leaf" this is an assumption: it is
// not certain that the outlined function will have to sign its return
// address, but that decision is made later, after the decision to outline
// has already been made.
// The same holds for the number of additional instructions we need: On
// v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
// necessary. However, at this point we don't know if the outlined function
// will have a RET instruction so we assume the worst.
const Function &FCF = FirstCand.getMF()->getFunction();
const TargetRegisterInfo &TRI = getRegisterInfo();
if (FCF.hasFnAttribute("sign-return-address")) {
// One PAC and one AUT instructions
NumBytesToCreateFrame += 8;
// We have to check if sp-modifying instructions would get outlined.
// If so, we only allow outlining if sp is unchanged overall, so matching
// sub and add instructions are okay to outline; all other sp modifications
// are not.
auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
int SPValue = 0;
MachineBasicBlock::iterator MBBI = C.front();
for (;;) {
if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
switch (MBBI->getOpcode()) {
case AArch64::ADDXri:
case AArch64::ADDWri:
assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
assert(MBBI->getOperand(2).isImm() &&
"Expected operand to be immediate");
assert(MBBI->getOperand(1).isReg() &&
"Expected operand to be a register");
// Check if the add just increments sp. If so, we search for
// matching sub instructions that decrement sp. If not, the
// modification is illegal
if (MBBI->getOperand(1).getReg() == AArch64::SP)
SPValue += MBBI->getOperand(2).getImm();
else
return true;
break;
case AArch64::SUBXri:
case AArch64::SUBWri:
assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
assert(MBBI->getOperand(2).isImm() &&
"Expected operand to be immediate");
assert(MBBI->getOperand(1).isReg() &&
"Expected operand to be a register");
// Check if the sub just decrements sp. If so, we search for
// matching add instructions that increment sp. If not, the
// modification is illegal
if (MBBI->getOperand(1).getReg() == AArch64::SP)
SPValue -= MBBI->getOperand(2).getImm();
else
return true;
break;
default:
return true;
}
}
if (MBBI == C.back())
break;
++MBBI;
}
if (SPValue)
return true;
return false;
};
// Remove candidates with illegal stack modifying instructions
RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
RepeatedSequenceLocs.end(),
hasIllegalSPModification),
RepeatedSequenceLocs.end());
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
return outliner::OutlinedFunction();
}
// Properties about candidate MBBs that hold for all of them.
unsigned FlagsSetInAll = 0xF;
// Compute liveness information for each candidate, and set FlagsSetInAll.
std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[&FlagsSetInAll](outliner::Candidate &C) {
FlagsSetInAll &= C.Flags;
});
// According to the AArch64 Procedure Call Standard, the following are
// undefined on entry/exit from a function call:
//
// * Registers x16, x17, (and thus w16, w17)
// * Condition codes (and thus the NZCV register)
//
// Because of this, we can't outline any sequence of instructions where one
// of these registers is live into/across it. Thus, we need to delete those
// candidates.
auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
// If the unsafe registers in this block are all dead, then we don't need
// to compute liveness here.
if (C.Flags & UnsafeRegsDead)
return false;
C.initLRU(TRI);
LiveRegUnits LRU = C.LRU;
return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
!LRU.available(AArch64::NZCV));
};
// Are there any candidates where those registers are live?
if (!(FlagsSetInAll & UnsafeRegsDead)) {
// Erase every candidate that violates the restrictions above. (It could be
// true that we have viable candidates, so it's not worth bailing out in
// the case that, say, 1 out of 20 candidates violates the restrictions.)
RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
RepeatedSequenceLocs.end(),
CantGuaranteeValueAcrossCall),
RepeatedSequenceLocs.end());
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
return outliner::OutlinedFunction();
}
// At this point, we have only "safe" candidates to outline. Figure out
// frame + call instruction information.
unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
// Helper lambda which sets call information for every candidate.
auto SetCandidateCallInfo =
[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
for (outliner::Candidate &C : RepeatedSequenceLocs)
C.setCallInfo(CallID, NumBytesForCall);
};
unsigned FrameID = MachineOutlinerDefault;
NumBytesToCreateFrame += 4;
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
});
// We check to see if CFI Instructions are present, and if they are
// we find the number of CFI Instructions in the candidates.
unsigned CFICount = 0;
MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
const std::vector<MCCFIInstruction> &CFIInstructions =
RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
if (MBBI->isCFIInstruction()) {
unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
MCCFIInstruction CFI = CFIInstructions[CFIIndex];
CFICount++;
}
MBBI++;
}
// We compare the number of found CFI Instructions to the number of CFI
// instructions in the parent function for each candidate. We must check this
// since if we outline one of the CFI instructions in a function, we have to
// outline them all for correctness. If we do not, the address offsets will be
// incorrect between the two sections of the program.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
std::vector<MCCFIInstruction> CFIInstructions =
C.getMF()->getFrameInstructions();
if (CFICount > 0 && CFICount != CFIInstructions.size())
return outliner::OutlinedFunction();
}
// Returns true if an instruction is safe to fix up, false otherwise.
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
if (MI.isCall())
return true;
if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
!MI.readsRegister(AArch64::SP, &TRI))
return true;
// Any modification of SP will break our code to save/restore LR.
// FIXME: We could handle some instructions which add a constant
// offset to SP, with a bit more work.
if (MI.modifiesRegister(AArch64::SP, &TRI))
return false;
// At this point, we have a stack instruction that we might need to
// fix up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
const MachineOperand *Base; // Filled with the base operand of MI.
int64_t Offset; // Filled with the offset of MI.
bool OffsetIsScalable;
// Does it allow us to offset the base operand and is the base the
// register SP?
if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
!Base->isReg() || Base->getReg() != AArch64::SP)
return false;
// Fix-up code below assumes bytes.
if (OffsetIsScalable)
return false;
// Find the minimum/maximum offset for this instruction and check
// if fixing it up would be in range.
int64_t MinOffset,
MaxOffset; // Unscaled offsets for the instruction.
TypeSize Scale(0U, false); // The scale to multiply the offsets by.
unsigned DummyWidth;
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
Offset += 16; // Update the offset to what it would be if we outlined.
if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
Offset > MaxOffset * (int64_t)Scale.getFixedSize())
return false;
// It's in range, so we can outline it.
return true;
}
// FIXME: Add handling for instructions like "add x0, sp, #8".
// We can't fix it up, so don't outline it.
return false;
};
// True if it's possible to fix up each stack instruction in this sequence.
// Important for frames/call variants that modify the stack.
bool AllStackInstrsSafe = std::all_of(
FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
if (RepeatedSequenceLocs[0].back()->isTerminator()) {
FrameID = MachineOutlinerTailCall;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerTailCall, 4);
}
else if (LastInstrOpcode == AArch64::BL ||
((LastInstrOpcode == AArch64::BLR ||
LastInstrOpcode == AArch64::BLRNoIP) &&
!HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerThunk, 4);
}
else {
// We need to decide how to emit calls + frames. We can always emit the same
// frame if we don't need to save to the stack. If we have to save to the
// stack, then we need a different frame.
unsigned NumBytesNoStackCalls = 0;
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
// Check if we have to save LR.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
C.initLRU(TRI);
// If we have a noreturn caller, then we're going to be conservative and
// say that we have to save LR. If we don't have a ret at the end of the
// block, then we can't reason about liveness accurately.
//
// FIXME: We can probably do better than always disabling this in
// noreturn functions by fixing up the liveness info.
bool IsNoReturn =
C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
// Is LR available? If so, we don't need a save.
if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
NumBytesNoStackCalls += 4;
C.setCallInfo(MachineOutlinerNoLRSave, 4);
CandidatesWithoutStackFixups.push_back(C);
}
// Is an unused register available? If so, we won't modify the stack, so
// we can outline with the same frame type as those that don't save LR.
else if (findRegisterToSaveLRTo(C)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerRegSave, 12);
CandidatesWithoutStackFixups.push_back(C);
}
// Is SP used in the sequence at all? If not, we don't have to modify
// the stack, so we are guaranteed to get the same frame.
else if (C.UsedInSequence.available(AArch64::SP)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerDefault, 12);
CandidatesWithoutStackFixups.push_back(C);
}
// If we outline this, we need to modify the stack. Pretend we don't
// outline this by saving all of its bytes.
else {
NumBytesNoStackCalls += SequenceSize;
}
}
// If there are no places where we have to save LR, then note that we
// don't have to update the stack. Otherwise, give every candidate the
// default call type, as long as it's safe to do so.
if (!AllStackInstrsSafe ||
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
FrameID = MachineOutlinerNoLRSave;
} else {
SetCandidateCallInfo(MachineOutlinerDefault, 12);
}
// If we dropped all of the candidates, bail out here.
if (RepeatedSequenceLocs.size() < 2) {
RepeatedSequenceLocs.clear();
return outliner::OutlinedFunction();
}
}
// Does every candidate's MBB contain a call? If so, then we might have a call
// in the range.
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
// Check if the range contains a call. These require a save + restore of the
// link register.
bool ModStackToSaveLR = false;
if (std::any_of(FirstCand.front(), FirstCand.back(),
[](const MachineInstr &MI) { return MI.isCall(); }))
ModStackToSaveLR = true;
// Handle the last instruction separately. If this is a tail call, then the
// last instruction is a call. We don't want to save + restore in this case.
// However, it could be possible that the last instruction is a call without
// it being valid to tail call this sequence. We should consider this as
// well.
else if (FrameID != MachineOutlinerThunk &&
FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
ModStackToSaveLR = true;
if (ModStackToSaveLR) {
// We can't fix up the stack. Bail out.
if (!AllStackInstrsSafe) {
RepeatedSequenceLocs.clear();
return outliner::OutlinedFunction();
}
// Save + restore LR.
NumBytesToCreateFrame += 8;
}
}
// If we have CFI instructions, we can only outline if the outlined section
// can be a tail call
if (FrameID != MachineOutlinerTailCall && CFICount > 0)
return outliner::OutlinedFunction();
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
}
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
const Function &F = MF.getFunction();
// Can F be deduplicated by the linker? If it can, don't outline from it.
if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
return false;
// Don't outline from functions with section markings; the program could
// expect that all the code is in the named section.
// FIXME: Allow outlining from multiple functions with the same section
// marking.
if (F.hasSection())
return false;
// Outlining from functions with redzones is unsafe since the outliner may
// modify the stack. Check if hasRedZone is true or unknown; if yes, don't
// outline from it.
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
// FIXME: Teach the outliner to generate/handle Windows unwind info.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
return false;
// It's safe to outline from MF.
return true;
}
bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
unsigned &Flags) const {
// Check if LR is available through all of the MBB. If it's not, then set
// a flag.
assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
"Suitable Machine Function for outlining must track liveness");
LiveRegUnits LRU(getRegisterInfo());
std::for_each(MBB.rbegin(), MBB.rend(),
[&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
// Check if each of the unsafe registers are available...
bool W16AvailableInBlock = LRU.available(AArch64::W16);
bool W17AvailableInBlock = LRU.available(AArch64::W17);
bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
// If all of these are dead (and not live out), we know we don't have to check
// them later.
if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
// Now, add the live outs to the set.
LRU.addLiveOuts(MBB);
// If any of these registers is available in the MBB, but also a live out of
// the block, then we know outlining is unsafe.
if (W16AvailableInBlock && !LRU.available(AArch64::W16))
return false;
if (W17AvailableInBlock && !LRU.available(AArch64::W17))
return false;
if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
return false;
// Check if there's a call inside this MachineBasicBlock. If there is, then
// set a flag.
if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
Flags |= MachineOutlinerMBBFlags::HasCalls;
MachineFunction *MF = MBB.getParent();
// In the event that we outline, we may have to save LR. If there is an
// available register in the MBB, then we'll always save LR there. Check if
// this is true.
bool CanSaveLR = false;
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
CanSaveLR = true;
break;
}
}
// Check if we have a register we can save LR to, and if LR was used
// somewhere. If both of those things are true, then we need to evaluate the
// safety of outlining stack instructions later.
if (!CanSaveLR && !LRU.available(AArch64::LR))
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
return true;
}
outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
unsigned Flags) const {
MachineInstr &MI = *MIT;
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
// Don't outline anything used for return address signing. The outlined
// function will get signed later if needed
switch (MI.getOpcode()) {
case AArch64::PACIASP:
case AArch64::PACIBSP:
case AArch64::AUTIASP:
case AArch64::AUTIBSP:
case AArch64::RETAA:
case AArch64::RETAB:
case AArch64::EMITBKEY:
return outliner::InstrType::Illegal;
}
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
// We can only outline these if we will tail call the outlined function, or
// fix up the CFI offsets. Currently, CFI instructions are outlined only when
// the outlined section is tail-called.
//
// FIXME: If the proper fixups for the offset are implemented, this should be
// possible.
if (MI.isCFIInstruction())
return outliner::InstrType::Legal;
// Don't allow debug values to impact outlining type.
if (MI.isDebugInstr() || MI.isIndirectDebugValue())
return outliner::InstrType::Invisible;
// At this point, KILL instructions don't really tell us much so we can go
// ahead and skip over them.
if (MI.isKill())
return outliner::InstrType::Invisible;
// Is this a terminator for a basic block?
if (MI.isTerminator()) {
// Is this the end of a function?
if (MI.getParent()->succ_empty())
return outliner::InstrType::Legal;
// It's not, so don't outline it.
return outliner::InstrType::Illegal;
}
// Make sure none of the operands are un-outlinable.
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
MOP.isTargetIndex())
return outliner::InstrType::Illegal;
// If it uses LR or W30 explicitly, then don't touch it.
if (MOP.isReg() && !MOP.isImplicit() &&
(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
return outliner::InstrType::Illegal;
}
// Special cases for instructions that can always be outlined, but will fail
// the later tests, e.g., ADRPs, which are PC-relative use LR, but can always
// be outlined because they don't require a *specific* value to be in LR.
if (MI.getOpcode() == AArch64::ADRP)
return outliner::InstrType::Legal;
// If MI is a call we might be able to outline it. We don't want to outline
// any calls that rely on the position of items on the stack. When we outline
// something containing a call, we have to emit a save and restore of LR in
// the outlined function. Currently, this always happens by saving LR to the
// stack. Thus, if we outline, say, half the parameters for a function call
// plus the call, then we'll break the callee's expectations for the layout
// of the stack.
//
// FIXME: Allow calls to functions which construct a stack frame, as long
// as they don't access arguments on the stack.
// FIXME: Figure out some way to analyze functions defined in other modules.
// We should be able to compute the memory usage based on the IR calling
// convention, even if we can't see the definition.
if (MI.isCall()) {
// Get the function associated with the call. Look at each operand and find
// the one that represents the callee and get its name.
const Function *Callee = nullptr;
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isGlobal()) {
Callee = dyn_cast<Function>(MOP.getGlobal());
break;
}
}
// Never outline calls to mcount. There isn't any rule that would require
// this, but the Linux kernel's "ftrace" feature depends on it.
if (Callee && Callee->getName() == "\01_mcount")
return outliner::InstrType::Illegal;
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
// as a tail-call. Explicitly list the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
if (MI.getOpcode() == AArch64::BLR ||
MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
return UnknownCallOutlineType;
// We have a function we have information about. Check if it's something we
// can safely outline.
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
// We don't know what's going on with the callee at all. Don't touch it.
if (!CalleeMF)
return UnknownCallOutlineType;
// Check if we know anything about the callee saves on the function. If we
// don't, then don't touch it, since that implies that we haven't
// computed anything about its stack frame yet.
MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
MFI.getNumObjects() > 0)
return UnknownCallOutlineType;
// At this point, we can say that CalleeMF ought to not pass anything on the
// stack. Therefore, we can outline it.
return outliner::InstrType::Legal;
}
// Don't outline positions.
if (MI.isPosition())
return outliner::InstrType::Illegal;
// Don't touch the link register or W30.
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
return outliner::InstrType::Illegal;
// Don't outline BTI instructions, because that will prevent the outlining
// site from being indirectly callable.
if (MI.getOpcode() == AArch64::HINT) {
int64_t Imm = MI.getOperand(0).getImm();
if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
return outliner::InstrType::Illegal;
}
return outliner::InstrType::Legal;
}
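// After outlining with a stack-based LR save, SP-relative loads/stores inside
// the outlined body are off by the 16 bytes used to spill LR; rewrite their
// immediate offsets here to account for that.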
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
for (MachineInstr &MI : MBB) {
const MachineOperand *Base;
unsigned Width;
int64_t Offset;
bool OffsetIsScalable;
// Is this a load or store with an immediate offset with SP as the base?
if (!MI.mayLoadOrStore() ||
!getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
&RI) ||
(Base->isReg() && Base->getReg() != AArch64::SP))
continue;
// It is, so we have to fix it up.
TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
assert(Scale != 0 && "Unexpected opcode!");
assert(!OffsetIsScalable && "Expected offset to be a byte offset");
// We've pushed the return address to the stack, so add 16 to the offset.
// This is safe, since we already checked if it would overflow when we
// checked if this instruction was legal to outline.
int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
StackOffsetOperand.setImm(NewImm);
}
}
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
bool ShouldSignReturnAddr,
bool ShouldSignReturnAddrWithAKey) {
if (ShouldSignReturnAddr) {
MachineBasicBlock::iterator MBBPAC = MBB.begin();
MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
if (MBBAUT != MBB.end())
DL = MBBAUT->getDebugLoc();
// At the very beginning of the basic block we insert the following
// depending on the key type
//
// a_key: b_key:
// PACIASP EMITBKEY
// CFI_INSTRUCTION PACIBSP
// CFI_INSTRUCTION
if (ShouldSignReturnAddrWithAKey) {
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
.setMIFlag(MachineInstr::FrameSetup);
} else {
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
.setMIFlag(MachineInstr::FrameSetup);
}
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
// If v8.3a features are available we can replace a RET instruction by
// RETAA or RETAB and omit the AUT instructions
if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
BuildMI(MBB, MBBAUT, DL,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
: AArch64::RETAB))
.copyImplicitOps(*MBBAUT);
MBB.erase(MBBAUT);
} else {
BuildMI(MBB, MBBAUT, DL,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
: AArch64::AUTIBSP))
.setMIFlag(MachineInstr::FrameDestroy);
}
}
}
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
if (OF.FrameConstructionID == MachineOutlinerTailCall)
FI->setOutliningStyle("Tail Call");
else if (OF.FrameConstructionID == MachineOutlinerThunk) {
// For thunk outlining, rewrite the last instruction from a call to a
// tail-call.
MachineInstr *Call = &*--MBB.instr_end();
unsigned TailOpcode;
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
assert(Call->getOpcode() == AArch64::BLR ||
Call->getOpcode() == AArch64::BLRNoIP);
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
.add(Call->getOperand(0))
.addImm(0);
MBB.insert(MBB.end(), TC);
Call->eraseFromParent();
FI->setOutliningStyle("Thunk");
}
bool IsLeafFunction = true;
// Is there a call in the outlined range?
auto IsNonTailCall = [](const MachineInstr &MI) {
return MI.isCall() && !MI.isReturn();
};
if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
"Can only fix up stack references once");
fixupPostOutline(MBB);
IsLeafFunction = false;
// LR has to be a live in so that we can save it.
if (!MBB.isLiveIn(AArch64::LR))
MBB.addLiveIn(AArch64::LR);
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk)
Et = std::prev(MBB.end());
// Insert a save before the outlined region
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
.addImm(-16);
It = MBB.insert(It, STRXpre);
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
// Add a CFI saying the stack was moved 16 B down.
int64_t StackPosEntry =
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(StackPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
// Add a CFI saying that the LR that we want to find is now 16 B higher than
// before.
int64_t LRPosEntry =
MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(LRPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
Et = MBB.insert(Et, LDRXpost);
}
// If a bunch of candidates reach this point, they must agree on their return
// address signing. It is therefore enough to just consider the signing
// behaviour of one of them.
const Function &CF = OF.Candidates.front().getMF()->getFunction();
bool ShouldSignReturnAddr = false;
if (CF.hasFnAttribute("sign-return-address")) {
StringRef Scope =
CF.getFnAttribute("sign-return-address").getValueAsString();
if (Scope.equals("all"))
ShouldSignReturnAddr = true;
else if (Scope.equals("non-leaf") && !IsLeafFunction)
ShouldSignReturnAddr = true;
}
// a_key is the default
bool ShouldSignReturnAddrWithAKey = true;
if (CF.hasFnAttribute("sign-return-address-key")) {
const StringRef Key =
CF.getFnAttribute("sign-return-address-key").getValueAsString();
// Key can either be a_key or b_key
assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
"Return address signing key must be either a_key or b_key");
ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
}
// If this is a tail call outlined function, then there's already a return.
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk) {
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
ShouldSignReturnAddrWithAKey);
return;
}
// It's not a tail call, so we have to insert the return ourselves.
// LR has to be a live in so that we can return to it.
if (!MBB.isLiveIn(AArch64::LR))
MBB.addLiveIn(AArch64::LR);
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
.addReg(AArch64::LR);
MBB.insert(MBB.end(), ret);
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
ShouldSignReturnAddrWithAKey);
FI->setOutliningStyle("Function");
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
return;
// We modified the stack.
// Walk over the basic block and fix up all the stack accesses.
fixupPostOutline(MBB);
}
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
MachineFunction &MF, const outliner::Candidate &C) const {
// Are we tail calling?
if (C.CallConstructionID == MachineOutlinerTailCall) {
// If yes, then we can just branch to the label.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
.addGlobalAddress(M.getNamedValue(MF.getName()))
.addImm(0));
return It;
}
// Are we saving the link register?
if (C.CallConstructionID == MachineOutlinerNoLRSave ||
C.CallConstructionID == MachineOutlinerThunk) {
// No, so just insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
return It;
}
// We want to return the spot where we inserted the call.
MachineBasicBlock::iterator CallPt;
// Instructions for saving and restoring LR around the call instruction we're
// going to insert.
MachineInstr *Save;
MachineInstr *Restore;
// Can we save to a register?
if (C.CallConstructionID == MachineOutlinerRegSave) {
// FIXME: This logic should be sunk into a target-specific interface so that
// we don't have to recompute the register.
unsigned Reg = findRegisterToSaveLRTo(C);
assert(Reg != 0 && "No callee-saved register available?");
// Save and restore LR from that register.
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
.addReg(AArch64::XZR)
.addReg(AArch64::LR)
.addImm(0);
Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
.addReg(AArch64::XZR)
.addReg(Reg)
.addImm(0);
} else {
// We have the default case. Save and restore from SP.
Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
.addImm(-16);
Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
}
It = MBB.insert(It, Save);
It++;
// Insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
CallPt = It;
It++;
It = MBB.insert(It, Restore);
return CallPt;
}
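// By default, only outline from functions that are optimized for minimum size.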
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
MachineFunction &MF) const {
return MF.getFunction().hasMinSize();
}
Optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
// AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR register operand and a
// zero immediate are used as an alias for the mov instruction.
if (MI.getOpcode() == AArch64::ORRWrs &&
MI.getOperand(1).getReg() == AArch64::WZR &&
MI.getOperand(3).getImm() == 0x0) {
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
}
if (MI.getOpcode() == AArch64::ORRXrs &&
MI.getOperand(1).getReg() == AArch64::XZR &&
MI.getOperand(3).getImm() == 0x0) {
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
}
return None;
}
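// If MI adds or subtracts an immediate and writes the result to Reg, return
// the source register together with the signed (shifted) immediate offset it
// applies; otherwise return None.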
Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
Register Reg) const {
int Sign = 1;
int64_t Offset = 0;
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
const MachineOperand &Op0 = MI.getOperand(0);
if (!Op0.isReg() || Reg != Op0.getReg())
return None;
switch (MI.getOpcode()) {
default:
return None;
case AArch64::SUBWri:
case AArch64::SUBXri:
case AArch64::SUBSWri:
case AArch64::SUBSXri:
Sign *= -1;
LLVM_FALLTHROUGH;
case AArch64::ADDSWri:
case AArch64::ADDSXri:
case AArch64::ADDWri:
case AArch64::ADDXri: {
// TODO: Third operand can be global address (usually some string).
if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
!MI.getOperand(2).isImm())
return None;
Offset = MI.getOperand(2).getImm() * Sign;
int Shift = MI.getOperand(3).getImm();
assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
Offset = Offset << Shift;
}
}
return RegImmPair{MI.getOperand(1).getReg(), Offset};
}
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
auto DestSrc = TII->isCopyInstr(MI);
if (!DestSrc)
return None;
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
if (DestReg == DescribedReg)
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
// ORRWrs zero-extends to 64-bits, so we need to consider such cases.
if (MI.getOpcode() == AArch64::ORRWrs &&
TRI->isSuperRegister(DestReg, DescribedReg))
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
// We may need to describe the lower part of a ORRXrs move.
if (MI.getOpcode() == AArch64::ORRXrs &&
TRI->isSubRegister(DestReg, DescribedReg)) {
Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
}
assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
"Unhandled ORR[XW]rs copy case");
return None;
}
Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
Register Reg) const {
const MachineFunction *MF = MI.getMF();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
switch (MI.getOpcode()) {
case AArch64::MOVZWi:
case AArch64::MOVZXi: {
// MOVZWi may be used for producing zero-extended 32-bit immediates in
// 64-bit parameters, so we need to consider super-registers.
if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
return None;
if (!MI.getOperand(1).isImm())
return None;
int64_t Immediate = MI.getOperand(1).getImm();
int Shift = MI.getOperand(2).getImm();
return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
nullptr);
}
case AArch64::ORRWrs:
case AArch64::ORRXrs:
return describeORRLoadedValue(MI, Reg, this, TRI);
}
return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
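// The per-opcode element size is encoded in the instruction's TSFlags; mask it
// out here.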
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
return get(Opc).TSFlags & AArch64::ElementSizeMask;
}
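// When straight-line-speculation (SLS) hardening of BLR is enabled on the
// subtarget, indirect calls are emitted as the BLRNoIP pseudo instead of a
// plain BLR.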
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
else
return AArch64::BLR;
}
#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 886158ca4490..83a488afc797 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1,679 +1,705 @@
//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetRegisterInfo
// class.
//
//===----------------------------------------------------------------------===//
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
AArch64_MC::initLLVMToCVRegMapping(this);
}
+static bool hasSVEArgsOrReturn(const MachineFunction *MF) {
+ const Function &F = MF->getFunction();
+ return isa<ScalableVectorType>(F.getReturnType()) ||
+ any_of(F.args(), [](const Argument &Arg) {
+ return isa<ScalableVectorType>(Arg.getType());
+ });
+}
+
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
// Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save
// lists depending on that will need to have their Darwin variant as well.
if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
return getDarwinCalleeSavedRegs(MF);
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
return CSR_Win_AArch64_CFGuard_Check_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
return CSR_Win_AArch64_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
return CSR_AArch64_SVE_AAPCS_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::Win64)
// This is for OSes other than Windows; Windows is a separate case further
// above.
return CSR_AArch64_AAPCS_X18_SaveList;
+ if (hasSVEArgsOrReturn(MF))
+ return CSR_AArch64_SVE_AAPCS_SaveList;
return CSR_AArch64_AAPCS_SaveList;
}
const MCPhysReg *
AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
"Invalid subtarget for getDarwinCalleeSavedRegs");
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
report_fatal_error(
"Calling convention CFGuard_Check is unsupported on Darwin.");
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_Darwin_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
report_fatal_error(
"Calling convention SVE_VectorCall is unsupported on Darwin.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()
? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList
: CSR_Darwin_AArch64_CXX_TLS_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
return CSR_Darwin_AArch64_AAPCS_SaveList;
}
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList;
return nullptr;
}
void AArch64RegisterInfo::UpdateCustomCalleeSavedRegs(
MachineFunction &MF) const {
const MCPhysReg *CSRs = getCalleeSavedRegs(&MF);
SmallVector<MCPhysReg, 32> UpdatedCSRs;
for (const MCPhysReg *I = CSRs; *I; ++I)
UpdatedCSRs.push_back(*I);
for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
UpdatedCSRs.push_back(AArch64::GPR64commonRegClass.getRegister(i));
}
}
// Register lists are zero-terminated.
UpdatedCSRs.push_back(0);
MF.getRegInfo().setCalleeSavedRegs(UpdatedCSRs);
}
const TargetRegisterClass *
AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
unsigned Idx) const {
// edge case for GPR/FPR register classes
if (RC == &AArch64::GPR32allRegClass && Idx == AArch64::hsub)
return &AArch64::FPR32RegClass;
else if (RC == &AArch64::GPR64allRegClass && Idx == AArch64::hsub)
return &AArch64::FPR64RegClass;
// Forward to TableGen's default version.
return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
}
const uint32_t *
AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
"Invalid subtarget for getDarwinCallPreservedMask");
if (CC == CallingConv::CXX_FAST_TLS)
return CSR_Darwin_AArch64_CXX_TLS_RegMask;
if (CC == CallingConv::AArch64_VectorCall)
return CSR_Darwin_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
report_fatal_error(
"Calling convention SVE_VectorCall is unsupported on Darwin.");
if (CC == CallingConv::CFGuard_Check)
report_fatal_error(
"Calling convention CFGuard_Check is unsupported on Darwin.");
if (MF.getSubtarget<AArch64Subtarget>()
.getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
return CSR_Darwin_AArch64_AAPCS_RegMask;
}
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
// All the following calling conventions are handled differently on Darwin.
if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
if (SCS)
report_fatal_error("ShadowCallStack attribute not supported on Darwin.");
return getDarwinCallPreservedMask(MF, CC);
}
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
return SCS ? CSR_AArch64_SVE_AAPCS_SCS_RegMask
: CSR_AArch64_SVE_AAPCS_RegMask;
if (CC == CallingConv::CFGuard_Check)
return CSR_Win_AArch64_CFGuard_Check_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask
: CSR_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask
: CSR_AArch64_RT_MostRegs_RegMask;
else
return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
}
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_Darwin_AArch64_TLS_RegMask;
assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
}
void AArch64RegisterInfo::UpdateCustomCallPreservedMask(MachineFunction &MF,
const uint32_t **Mask) const {
uint32_t *UpdatedMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(getNumRegs());
memcpy(UpdatedMask, *Mask, sizeof(UpdatedMask[0]) * RegMaskSize);
for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
for (MCSubRegIterator SubReg(AArch64::GPR64commonRegClass.getRegister(i),
this, true);
SubReg.isValid(); ++SubReg) {
// See TargetRegisterInfo::getCallPreservedMask for how to interpret the
// register mask.
UpdatedMask[*SubReg / 32] |= 1u << (*SubReg % 32);
}
}
}
*Mask = UpdatedMask;
}
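// Illustrative sketch (editorial, not part of the patch): the call-preserved
// register mask packs one bit per physical register number into 32-bit words,
// and a set bit means the register is preserved across the call. The loop
// above sets that bit for every sub-register of a custom callee-saved X
// register; the hypothetical helpers below show the same indexing in
// isolation.
static void setPreservedBit(uint32_t *Mask, unsigned PhysRegNum) {
  Mask[PhysRegNum / 32] |= 1u << (PhysRegNum % 32);
}
static bool isPreservedReg(const uint32_t *Mask, unsigned PhysRegNum) {
  return (Mask[PhysRegNum / 32] >> (PhysRegNum % 32)) & 1u;
}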
const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const {
return CSR_AArch64_NoRegs_RegMask;
}
const uint32_t *
AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
// This should return a register mask that is the same as that returned by
// getCallPreservedMask, but that additionally preserves the register used for
// the first i64 argument (which must also be the register used to return a
// single i64 return value).
//
// If the calling convention does not use the same register for both, this
// function should return NULL (that case does not currently apply).
assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin())
return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask;
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
return CSR_AArch64_StackProbe_Windows_RegMask;
}
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
markSuperRegs(Reserved, AArch64::WSP);
markSuperRegs(Reserved, AArch64::WZR);
if (TFI->hasFP(MF) || TT.isOSDarwin())
markSuperRegs(Reserved, AArch64::W29);
for (size_t i = 0; i < AArch64::GPR32commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(i))
markSuperRegs(Reserved, AArch64::GPR32commonRegClass.getRegister(i));
}
if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
// SLH uses register W16/X16 as the taint register.
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
markSuperRegs(Reserved, AArch64::W16);
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
MCRegister Reg) const {
return getReservedRegs(MF)[Reg];
}
bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const {
return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC),
std::end(*AArch64::GPR64argRegClass.MC),
[this, &MF](MCPhysReg r){return isReservedReg(MF, r);});
}
void AArch64RegisterInfo::emitReservedArgRegCallError(
const MachineFunction &MF) const {
const Function &F = MF.getFunction();
F.getContext().diagnose(DiagnosticInfoUnsupported{F, "AArch64 doesn't support"
" function calls if any of the argument registers is reserved."});
}
bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const {
return !isReservedReg(MF, PhysReg);
}
bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
}
const TargetRegisterClass *
AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
return &AArch64::GPR64spRegClass;
}
const TargetRegisterClass *
AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
if (RC == &AArch64::CCRRegClass)
return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
return RC;
}
unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// In the presence of variable sized objects or funclets, if the fixed stack
// size is large enough that FP-relative references will often be out of
// range, we can use a base pointer to allow access from the other direction,
// the way the SP normally works.
//
// Furthermore, if variable sized objects are present and the stack needs to
// be dynamically re-aligned, the base pointer is the only reliable way to
// reference the locals.
if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {
if (needsStackRealignment(MF))
return true;
+
+ if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Frames that have both variable-sized objects and scalable SVE objects
+ // should always use a base pointer.
+ if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
+ return true;
+ }
+
// Conservatively estimate whether the negative offset from the frame
// pointer will be sufficient to reach. If a function has a smallish
// frame, it's less likely to have lots of spills and callee saved
// space, so it's all more likely to be within range of the frame pointer.
// If it's wrong, we'll materialize the constant and still get to the
// object; it's just suboptimal. Negative offsets use the unscaled
// load/store instructions, which have a 9-bit signed immediate.
return MFI.getLocalFrameSize() >= 256;
}
return false;
}
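// Illustrative sketch (editorial, not part of the patch): the "256" threshold
// above corresponds to the reach of the unscaled load/store forms, whose
// immediate is a signed 9-bit byte offset, so FP-relative accesses beyond
// roughly that distance cannot be encoded directly.
static bool fitsUnscaled9BitImm(int64_t ByteOffset) {
  return ByteOffset >= -256 && ByteOffset <= 255;
}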
Register
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
}
bool AArch64RegisterInfo::requiresRegisterScavenging(
const MachineFunction &MF) const {
return true;
}
bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
const MachineFunction &MF) const {
return true;
}
bool
AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
// This function indicates whether the emergency spill slot should be placed
// close to the beginning of the stack frame (closer to FP) or the end
// (closer to SP).
//
// The beginning works most reliably if we have a frame pointer.
+ // In the presence of any non-constant space between FP and locals
+ // (e.g. in the case of stack realignment or a scalable SVE area), it is
+ // better to use SP or BP.
const AArch64FrameLowering &TFI = *getFrameLowering(MF);
- return TFI.hasFP(MF);
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() ||
+ AFI->hasCalculatedStackSizeSVE()) &&
+ "Expected SVE area to be calculated by this point");
+ return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE();
}
bool AArch64RegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
return true;
}
bool
AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
return true;
return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
}
/// needsFrameBaseReg - Returns true if the instruction's frame index
/// reference would be better served by a base register other than FP
/// or SP. Used by LocalStackFrameAllocation to determine which frame index
/// references it should create new base registers for.
bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
int64_t Offset) const {
for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
assert(i < MI->getNumOperands() &&
"Instr doesn't have FrameIndex operand!");
// It's the load/store FI references that cause issues, as it can be difficult
// to materialize the offset if it won't fit in the literal field. Estimate
// based on the size of the local frame and some conservative assumptions
// about the rest of the stack frame (note, this is pre-regalloc, so
// we don't know everything for certain yet) whether this offset is likely
// to be out of range of the immediate. Return true if so.
// We only generate virtual base registers for loads and stores, so
// return false for everything else.
if (!MI->mayLoad() && !MI->mayStore())
return false;
// Without a virtual base register, if the function has variable sized
// objects, all fixed-size local references will be via the frame pointer.
// Approximate the offset and see if it's legal for the instruction.
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
MachineFrameInfo &MFI = MF.getFrameInfo();
// Estimate an offset from the frame pointer.
// Conservatively assume all GPR callee-saved registers get pushed.
// FP, LR, X19-X28, D8-D15. 64-bits each.
int64_t FPOffset = Offset - 16 * 20;
// Estimate an offset from the stack pointer.
// The incoming offset is relative to the SP at the start of the function,
// but when we access the local it'll be relative to the SP after local
// allocation, so adjust our SP-relative offset by that allocation size.
Offset += MFI.getLocalFrameSize();
// Assume that we'll have at least some spill slots allocated.
// FIXME: This is a total SWAG number. We should run some statistics
// and pick a real one.
Offset += 128; // 128 bytes of spill slots
// If there is a frame pointer, try using it.
// The FP is only available if there is no dynamic realignment. We
// don't know for sure yet whether we'll need that, so we guess based
// on whether there are any local variables that would trigger it.
if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset))
return false;
// If we can reference via the stack pointer or base pointer, try that.
// FIXME: This (and the code that resolves the references) can be improved
// to only disallow SP relative references in the live range of
// the VLA(s). In practice, it's unclear how much difference that
// would make, but it may be worth doing.
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
// If even offset 0 is illegal, we don't want a virtual base register.
if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
return false;
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
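// Illustrative worked example (editorial, not part of the patch): for a
// hypothetical frame index whose entry-SP-relative offset is -1024 in a
// function with a 512-byte local frame block, the estimates above become:
static void needsFrameBaseRegExample() {
  int64_t Offset = -1024;              // incoming, relative to SP at entry
  int64_t FPOffset = Offset - 16 * 20; // conservative CSR estimate => -1344
  Offset += 512;                       // MFI.getLocalFrameSize()
  Offset += 128;                       // assumed spill-slot allowance => -384
  // The code above then asks isFrameOffsetLegal() whether -1344 is reachable
  // from FP and whether -384 (and offset 0) is reachable from SP before
  // deciding to allocate a virtual base register.
  (void)FPOffset;
}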
bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
Register BaseReg,
int64_t Offset) const {
assert(MI && "Unable to get the legal offset for nil instruction.");
StackOffset SaveOffset(Offset, MVT::i8);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
}
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
/// at the beginning of the basic block.
void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"
if (Ins != MBB->end())
DL = Ins->getDebugLoc();
const MachineFunction &MF = *MBB->getParent();
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
.addFrameIndex(FrameIdx)
.addImm(Offset)
.addImm(Shifter);
}
void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
// AArch64 doesn't need the general 64-bit offsets
StackOffset Off(Offset, MVT::i8);
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
const MachineFunction *MF = MI.getParent()->getParent();
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
assert(Done && "Unable to resolve frame index!");
(void)Done;
}
// Create a scratch register for the frame index elimination in an instruction.
// This function has special handling of stack tagging loop pseudos, in which
// case it can also change the instruction opcode (but not the operands).
static Register
createScratchRegisterForInstruction(MachineInstr &MI,
const AArch64InstrInfo *TII) {
// ST*Gloop have a reserved scratch register in operand 1. Use it, and also
// replace the instruction with the writeback variant because it will now
// satisfy the operand constraints for it.
if (MI.getOpcode() == AArch64::STGloop) {
MI.setDesc(TII->get(AArch64::STGloop_wback));
return MI.getOperand(1).getReg();
} else if (MI.getOpcode() == AArch64::STZGloop) {
MI.setDesc(TII->get(AArch64::STZGloop_wback));
return MI.getOperand(1).getReg();
} else {
return MI.getMF()->getRegInfo().createVirtualRegister(
&AArch64::GPR64RegClass);
}
}
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
bool Tagged =
MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
Register FrameReg;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
StackOffset Offset =
TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
/*PreferFP=*/true,
/*ForSimm=*/false);
Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
return;
}
if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
FI.ChangeToImmediate(Offset);
return;
}
StackOffset Offset;
if (MI.getOpcode() == AArch64::TAGPstack) {
// TAGPstack must use the virtual frame register in its 3rd operand.
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
FrameReg = MI.getOperand(3).getReg();
Offset = {MFI.getObjectOffset(FrameIndex) +
AFI->getTaggedBasePointerOffset(),
MVT::i8};
} else if (Tagged) {
StackOffset SPOffset = {
MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
if (MFI.hasVarSizedObjects() ||
isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
(AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
// Can't update to SP + offset in place. Precalculate the tagged pointer
// in a scratch register.
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
TII);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(0);
MI.getOperand(FIOperandNum)
.ChangeToRegister(ScratchReg, false, false, true);
return;
}
FrameReg = AArch64::SP;
Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
MVT::i8};
} else {
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
}
// Modify MI as necessary to handle as much of 'Offset' as possible
if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
return;
assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
"Emergency spill slot is out of reach");
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
switch (RC->getID()) {
default:
return 0;
case AArch64::GPR32RegClassID:
case AArch64::GPR32spRegClassID:
case AArch64::GPR32allRegClassID:
case AArch64::GPR64spRegClassID:
case AArch64::GPR64allRegClassID:
case AArch64::GPR64RegClassID:
case AArch64::GPR32commonRegClassID:
case AArch64::GPR64commonRegClassID:
return 32 - 1 // XZR/SP
- (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- MF.getSubtarget<AArch64Subtarget>().getNumXRegisterReserved()
- hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
case AArch64::FPR128RegClassID:
return 32;
case AArch64::DDRegClassID:
case AArch64::DDDRegClassID:
case AArch64::DDDDRegClassID:
case AArch64::QQRegClassID:
case AArch64::QQQRegClassID:
case AArch64::QQQQRegClassID:
return 32;
case AArch64::FPR128_loRegClassID:
case AArch64::FPR64_loRegClassID:
case AArch64::FPR16_loRegClassID:
return 16;
}
}
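// Illustrative worked example (editorial, not part of the patch): the GPR case
// above for a hypothetical Darwin function that keeps a frame pointer, uses a
// base pointer, and has no custom-reserved X registers.
static unsigned gprPressureLimitExample() {
  unsigned Limit = 32;
  Limit -= 1; // XZR/SP share register number 31
  Limit -= 1; // FP (X29): hasFP(MF) || Darwin
  Limit -= 0; // no custom-reserved X registers in this example
  Limit -= 1; // base pointer (X19) in use
  return Limit; // 29 allocatable GPRs
}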
unsigned AArch64RegisterInfo::getLocalAddressRegister(
const MachineFunction &MF) const {
const auto &MFI = MF.getFrameInfo();
if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects())
return AArch64::SP;
else if (needsStackRealignment(MF))
return getBaseRegister();
return getFrameRegister(MF);
}
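// Editorial note: taken together, getFrameRegister() and
// getLocalAddressRegister() above choose how locals are addressed: plain SP
// for simple frames, the base pointer (X19) when funclets or variable-sized
// objects coincide with stack realignment, and otherwise FP whenever the
// function keeps a frame pointer.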
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 28a54e6f7d79..3449a8bd16d2 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1,2583 +1,2605 @@
//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
//
//===----------------------------------------------------------------------===//
// For predicated nodes where the entire operation is controlled by a governing
// predicate, please stick to a similar naming convention as used for the
// ISD nodes:
//
// SDNode <=> AArch64ISD
// -------------------------------
// _m<n> <=> _MERGE_OP<n>
// _mt <=> _MERGE_PASSTHRU
// _z <=> _MERGE_ZERO
// _p <=> _PRED
//
// Given the context of this file, it is not strictly necessary to use _p to
// distinguish predicated from unpredicated nodes, since most SVE
// instructions are predicated.
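// For example (editorial note, using definitions that appear later in this
// file):
//   AArch64fadd_p  <=> AArch64ISD::FADD_PRED           (_p  / _PRED)
//   AArch64smin_m1 <=> AArch64ISD::SMIN_MERGE_OP1      (_m1 / _MERGE_OP1)
//   AArch64dup_mt  <=> AArch64ISD::DUP_MERGE_PASSTHRU  (_mt / _MERGE_PASSTHRU)
//   AArch64ld1_z   <=> AArch64ISD::LD1_MERGE_ZERO      (_z  / _MERGE_ZERO)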
// Contiguous loads - node definitions
//
def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
// Non-faulting & first-faulting loads - node definitions
//
def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
// Contiguous load and replicate - node definitions
//
def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
// Gather loads - node definitions
//
def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
// Contiguous stores - node definitions
//
def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>,
SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2>
]>;
def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>;
// Scatter stores - node definitions
//
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
// AArch64 SVE/SVE2 - the remaining node definitions
//
// SVE CNT/INC/RDVL
def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;
// SVE DEC
def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>;
def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
def SDT_AArch64Arith : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
]>;
def SDT_AArch64FMA : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
]>;
// Predicated operations with the result of inactive lanes being unspecified.
def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
// Merging op1 into the inactive lanes.
def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;
def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
}
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;
// SVE predicated integer reductions.
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;
defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
// Add unpredicated alternative for the mul instruction.
def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
(MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
(MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
(MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
(MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;
defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
}
defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
}
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
// Add patterns for FMA where disabled lanes are undef.
// FIXME: Implement a pseudo so we can choose a better instruction after
// regalloc.
def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
(FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
(FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
(FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
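// Editorial note: FMLA is the destructive multiply-accumulate Zda = Zda + Zn * Zm,
// which is why the fma addend $Op3 is placed in the accumulator (tied) operand
// above while $Op1 and $Op2 become the multiplicands.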
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
// SVE floating point reductions.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
// Use more efficient NEON instructions to extract elements within the NEON
// part (first 128 bits) of an SVE register.
def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
(f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
(f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
(f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
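// Editorial note: this is possible because the low 128 bits of each SVE Z
// register alias the corresponding NEON V register (the zsub subregister), so
// element 0 can be read with ordinary subregister extracts.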
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
// Splat immediate (predicated)
defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
let Predicates = [HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
(CPY_ZPmV_H $passthru, $pg, $splat)>;
}
// Duplicate FP scalar into all vector elements
def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
(DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
let Predicates = [HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
}
// Duplicate +0.0 into all vector elements
def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
let Predicates = [HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
}
// Duplicate Int immediate into all vector elements
def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_B $a, $b)>;
def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_H $a, $b)>;
def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_S $a, $b)>;
def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
// Duplicate FP immediate into all vector elements
let AddedComplexity = 2 in {
def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
(FDUP_ZI_S fpimm32:$imm8)>;
def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
(FDUP_ZI_S fpimm32:$imm8)>;
def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
(FDUP_ZI_D fpimm64:$imm8)>;
}
// Select elements from either vector (predicated)
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
}
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
}
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
}
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>;
defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>;
defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>;
defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>;
defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>;
defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>;
defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>;
defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>;
defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
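// Predicate logical operations (zeroing governing predicate); the flag-setting
// forms use null_frag and are not selected from IR here.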
defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>;
defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>;
defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>;
defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>;
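// clasta/clastb conditionally extract the element after/at the last active
// element; lasta/lastb extract it unconditionally.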
defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", AArch64clasta_n>;
defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>;
defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>;
defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>;
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
}
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
}
// contiguous load with reg+immediate
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;
// LD1R loads (splat scalar to vector)
defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;
// LD1RQ loads (load quadword-vector and splat to scalable vector)
defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous load with reg+reg addressing.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// non-faulting contiguous load with reg+immediate
defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;
// First-faulting loads with reg+reg addressing.
defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
// LD(2|3|4) structured loads with reg+immediate
defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;
// LD(2|3|4) structured loads (register + register)
def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>;
defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>;
defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>;
defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>;
defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>;
// Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked into 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Gathers using scaled 32-bit offsets unpacked into 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;
// Non-temporal contiguous loads (register + register)
defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous store with reg+immediate
defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;
// contiguous store with reg+reg addressing.
defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Scatters using unpacked, unscaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw]
defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Scatters using packed, unscaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw]
defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Scatters using packed, scaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw #1]
defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Scatters using unpacked, scaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw #1]
defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Scatters using 32-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>;
defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>;
defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>;
// Scatters using 64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>;
defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>;
defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
// ST(2|3|4) structured stores (register + immediate)
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;
// ST(2|3|4) structured stores (register + register)
def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;
// Non-temporal contiguous stores (register + immediate)
defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;
// Non-temporal contiguous stores (register + register)
defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Fill/Spill
defm LDR_ZXI : sve_mem_z_fill<"ldr">;
defm LDR_PXI : sve_mem_p_fill<"ldr">;
defm STR_ZXI : sve_mem_z_spill<"str">;
defm STR_PXI : sve_mem_p_spill<"str">;
// Contiguous prefetch (register + immediate)
defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;
// Contiguous prefetch (register + register)
def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
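// Map the SVE prefetch intrinsic onto the reg+imm, reg+reg or plain-base forms
// of the contiguous prefetch instructions.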
multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
// reg + imm
let AddedComplexity = 2 in {
def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>;
}
// reg + reg
let AddedComplexity = 1 in {
def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)),
(RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>;
}
// default fallback
def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)),
(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
}
defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>;
defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>;
defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>;
defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_0 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_1 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_2 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_3 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
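// Table lookup (tbl) and the interleave (zip), de-interleave (uzp) and
// transpose (trn) permutes for vectors and predicates.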
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
}
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>;
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
}
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>;
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
+ // Extract lo/hi halves of legal predicate types.
+ def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+ (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+ (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+ (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+
+ // Concatenate two predicates.
+ def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
+ (UZP1_PPP_S $p1, $p2)>;
+ def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)),
+ (UZP1_PPP_H $p1, $p2)>;
+ def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)),
+ (UZP1_PPP_B $p1, $p2)>;
+
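// Integer compares producing a predicate result: vector-vector, wide 64-bit
// second operand, and immediate forms.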
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>;
defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>;
defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>;
defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>;
defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>;
defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>;
defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>;
defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>;
defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>;
defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;
defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>;
defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>;
defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>;
defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>;
defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>;
defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>;
defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>;
defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
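// Floating-point compares: vector-vector (fcm*), absolute compares
// (facge/facgt), and compares against #0.0.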
defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
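// While-loop predicate generation (whilelt/whilele/whilelo/whilels) and scalar
// compare-and-terminate (ctermeq/ctermne).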
defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>;
defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>;
defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>;
def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>;
def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>;
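// Read or add multiples of the vector/predicate length (rdvl/addvl/addpl) and
// count elements (cntb/cnth/cntw/cntd) or active predicate elements (cntp).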
def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>;
defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>;
defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
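// Increment/decrement scalars and vectors by a multiple of the element count
// or by the number of active predicate elements, including the signed/unsigned
// saturating forms.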
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>;
defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>;
defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>;
defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>;
defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>;
defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>;
defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>;
defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>;
defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>;
defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>;
defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>;
defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>;
defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>;
defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>;
defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>;
defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>;
defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", int_aarch64_sve_uqdecw_n32>;
defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>;
defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>;
defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>;
defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>;
defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>;
defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>;
defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>;
defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>;
defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>;
defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>;
defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>;
defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>;
defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>;
defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>;
defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>;
defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>;
defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>;
defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>;
defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>;
defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>;
defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>;
defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>;
defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>;
defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>;
defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>;
defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>;
defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>;
defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>;
defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>;
defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;
defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>;
defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>;
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
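// index: create a vector series from a start and step given as registers
// and/or immediates.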
defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
defm INDEX_II : sve_int_index_ii<"index", index_vector>;
// Unpredicated shifts
defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
}
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
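// Conversions between floating-point precisions (fcvt) and between integers
// and floating point (scvtf/ucvtf, fcvtzs/fcvtzu).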
defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>;
defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>;
defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>;
defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>;
defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>;
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
let Predicates = [HasBF16, HasSVE] in {
defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
}
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
def : InstAlias<"mov $Pd, $Pg/m, $Pn",
(SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
def : InstAlias<"mov $Pd, $Pn",
(ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"mov $Pd, $Pg/z, $Pn",
(AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"movs $Pd, $Pn",
(ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"movs $Pd, $Pg/z, $Pn",
(ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"not $Pd, $Pg/z, $Pn",
(EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
def : InstAlias<"nots $Pd, $Pg/z, $Pn",
(EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
// Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
// AArch64ExpandPseudoInsts.
let mayLoad = 1, hasSideEffects = 0 in {
  def LDR_ZZXI   : Pseudo<(outs   ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
  def LDR_ZZZXI  : Pseudo<(outs  ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
  def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let mayStore = 1, hasSideEffects = 0 in {
  def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
  def STR_ZZZXI  : Pseudo<(outs), (ins  ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
  def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
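// For example (illustrative only, not the exact expansion code): a fill such as
//   LDR_ZZXI z0_z1, [sp, #0]
// is rewritten into one LDR_ZXI per tuple element at consecutive
// vector-length-scaled offsets, roughly:
//   ldr z0, [sp, #0, mul vl]
//   ldr z1, [sp, #1, mul vl]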
def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
// LD1R of 128-bit masked data
def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_B_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_H_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_W_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_D_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
// General case that we ideally never want to match.
def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
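// Note: RDVL #1 materialises the vector length in bytes; the UBFM (a logical
// shift right by 4) divides that by 16 to give the number of 128-bit granules,
// i.e. vscale, which the MADD then multiplies by the requested scale.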
let AddedComplexity = 5 in {
def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
}
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
let Predicates = [IsLE] in {
def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
}
let Predicates = [IsLE, HasBF16, HasSVE] in {
def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
}
let Predicates = [IsLE, HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
}
def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
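// Note: every predicate element type lives in the same PPR register class, so
// these reinterpret_cast patterns amount to register-class copies and should
// not produce any real instructions.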
def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
                     Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
  // reg + reg
  let AddedComplexity = 1 in {
    def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
                         (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
  }
  // reg + imm
  let AddedComplexity = 2 in {
    def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
                         (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
  }
  def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
                       (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
}
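// Note: the AddedComplexity weights above make instruction selection prefer
// the scalar+immediate form over scalar+scalar, and either over the plain
// base-register fallback, whenever the address matches the more specific
// pattern. The analogous store and ld1/st1 multiclasses below use the same
// scheme.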
// 2-element contiguous loads
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
let Predicates = [HasBF16, HasSVE] in {
defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
}
// 16-element contiguous loads
defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
                      Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
  // reg + reg
  let AddedComplexity = 1 in {
    def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
                       (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
  }
  // reg + imm
  let AddedComplexity = 2 in {
    def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
                       (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
  }
  def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
                     (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous stores
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
let Predicates = [HasBF16, HasSVE] in {
defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
}
// 16-element contiguous stores
defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;
defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
                        Instruction PTrue> {
  let AddedComplexity = 1 in {
    def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                   (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
  }
  let AddedComplexity = 2 in {
    def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                  (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
  }
  def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
            (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
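// Note: these "unpredicated" accesses reuse the predicated ST1/LD1 forms,
// governed by an all-lanes predicate (PTRUE pattern 31, i.e. ALL), so every
// element is transferred.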
defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
                       Instruction PTrue> {
  let AddedComplexity = 1 in {
    def _imm : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                   (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
  }
  let AddedComplexity = 2 in {
    def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                  (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
  }
  def : Pat<(Ty (Load GPR64:$base)),
            (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
}
defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
  def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
  def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
                     (Store PPR:$Val, GPR64:$base, (i64 0))>;
}
defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
  def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
                (Load GPR64sp:$base, simm9:$offset)>;
  def _default : Pat<(Ty (load GPR64:$base)),
                     (Load GPR64:$base, (i64 0))>;
}
defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
               SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
  // reg + reg
  let AddedComplexity = 1 in {
    def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
              (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
  }
  // scalar + immediate (mul vl)
  let AddedComplexity = 2 in {
    def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
              (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
  }
  // base
  def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
            (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>;
}
// 2-element contiguous loads
defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
// 4-element contiguous loads
defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous loads
defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
let Predicates = [HasBF16, HasSVE] in {
defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
}
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
  // scalar + immediate (mul vl)
  let AddedComplexity = 1 in {
    def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
              (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
  }
  // base
  def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
            (I PPR:$gp, GPR64sp:$base, (i64 0))>;
}
// 2-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>;
defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>;
defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>;
defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>;
defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>;
defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>;
defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>;
defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>;
// 4-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>;
defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>;
defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>;
defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>;
defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>;
defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>;
// 8-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>;
defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;
let Predicates = [HasBF16, HasSVE] in {
defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
}
// 16-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;
multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
  // reg + reg
  let AddedComplexity = 1 in {
    def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
              (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
  }
  // base
  def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
            (I PPR:$gp, GPR64sp:$base, XZR)>;
}
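// Note: first-faulting loads only provide a scalar+scalar addressing mode, so
// the plain base-register case above supplies XZR as a zero offset register.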
// 2-element contiguous first faulting loads
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
// 4-element contiguous first faulting loads
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous first faulting loads
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
let Predicates = [HasBF16, HasSVE] in {
defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
}
// 16-element contiguous first faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
               SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
  // reg + reg
  let AddedComplexity = 1 in {
    def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
              (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
  }
  // scalar + immediate (mul vl)
  let AddedComplexity = 2 in {
    def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
              (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
  }
  // base
  def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
            (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous stores
defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
// 4-element contiguous stores
defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
// 8-element contiguous stores
defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
// 16-element contiguous stores
defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
(INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
// Insert scalar into vector[0]
def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
(CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
(SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
(SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
(SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
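// Note: PTRUE with pattern 1 (VL1) activates only the first lane, so the
// predicated SEL/CPY above merge the scalar into element 0 and leave the
// remaining elements of $vec unchanged.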
// Insert scalar into vector with scalar index
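// Note: with a runtime index there is no fixed single-lane predicate, so the
// patterns below construct one: INDEX 0, 1 generates the lane numbers, DUP
// splats the requested index, and CMPEQ produces a predicate with exactly that
// lane active, which the predicated CPY then uses to merge the scalar.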
def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
          (CPY_ZPmR_B ZPR:$vec,
                      (CMPEQ_PPzZZ_B (PTRUE_B 31),
                                     (INDEX_II_B 0, 1),
                                     (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                      GPR32:$src)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
          (CPY_ZPmR_H ZPR:$vec,
                      (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                     (INDEX_II_H 0, 1),
                                     (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                      GPR32:$src)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
          (CPY_ZPmR_S ZPR:$vec,
                      (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                     (INDEX_II_S 0, 1),
                                     (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                      GPR32:$src)>;
def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
          (CPY_ZPmR_D ZPR:$vec,
                      (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                     (INDEX_II_D 0, 1),
                                     (DUP_ZR_D GPR64:$index)),
                      GPR64:$src)>;
// Insert FP scalar into vector with scalar index
def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
          (CPY_ZPmV_H ZPR:$vec,
                      (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                     (INDEX_II_H 0, 1),
                                     (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                      $src)>;
def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
          (CPY_ZPmV_S ZPR:$vec,
                      (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                     (INDEX_II_S 0, 1),
                                     (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                      $src)>;
def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
          (CPY_ZPmV_D ZPR:$vec,
                      (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                     (INDEX_II_D 0, 1),
                                     (DUP_ZR_D $index)),
                      $src)>;
// Extract element from vector with immediate index
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
(EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
// Extract element from vector with scalar index
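// Note: WHILELS XZR, $index produces a predicate with lanes 0..$index active,
// so LASTB (last active element) extracts exactly element $index; if the index
// is past the end of the vector, the last element is returned instead.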
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
(LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
(LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
(LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
(LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
(LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
(LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
ZPR:$vec)>;
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
(LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
ZPR:$vec)>;
}
let Predicates = [HasSVE, HasMatMulInt8] in {
defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
}
let Predicates = [HasSVE, HasMatMulFP32] in {
defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
}
let Predicates = [HasSVE, HasMatMulFP64] in {
defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>;
defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>;
defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>;
defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>;
defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>;
defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
}
let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
}
let Predicates = [HasSVE2] in {
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
// SVE2 saturating multiply-add high (indexed)
defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>;
// SVE2 saturating multiply-add high (vectors, unpredicated)
defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>;
defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;
// SVE2 integer multiply (indexed)
defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;
// SVE2 saturating multiply high (indexed)
defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>;
// SVE2 signed saturating doubling multiply high (unpredicated)
defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>;
defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;
// SVE2 integer multiply vectors (unpredicated)
defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
// Add patterns for unpredicated version of smulh and umulh.
def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
(SMULH_ZZZ_B $Op1, $Op2)>;
def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
(SMULH_ZZZ_H $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
(SMULH_ZZZ_S $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
(SMULH_ZZZ_D $Op1, $Op2)>;
def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
(UMULH_ZZZ_B $Op1, $Op2)>;
def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
(UMULH_ZZZ_H $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
(UMULH_ZZZ_S $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
(UMULH_ZZZ_D $Op1, $Op2)>;
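// Note: the (AArch64ptrue 31) guard means these patterns only apply when the
// intrinsic's governing predicate is known to be all-true, which is exactly
// when the unpredicated SVE2 SMULH/UMULH forms give the same result.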
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
// SVE2 complex integer dot product
defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>;
// SVE2 complex integer multiply-add (indexed)
defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>;
// SVE2 complex saturating multiply-add (indexed)
defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>;
// SVE2 complex integer multiply-add
defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>;
defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// SVE2 integer multiply long (indexed)
defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>;
defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>;
defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>;
defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>;
defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>;
defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>;
defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>;
// SVE2 integer multiply-add long (vectors, unpredicated)
defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>;
defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>;
defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>;
defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>;
defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>;
defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>;
defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
// SVE2 saturating multiply-add long (indexed)
defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>;
defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>;
// SVE2 saturating multiply-add long (vectors, unpredicated)
defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>;
defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>;
defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>;
defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>;
// SVE2 saturating multiply-add interleaved long
defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>;
defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
// SVE2 integer halving add/subtract (predicated)
defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
// SVE2 integer pairwise add and accumulate long
defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;
// SVE2 integer pairwise arithmetic
defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;
// SVE2 integer unary operations (predicated)
defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>;
defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>;
defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>;
defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>;
// SVE2 saturating add/subtract
defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>;
defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>;
defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>;
defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>;
defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>;
defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>;
defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>;
defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
// SVE2 saturating/rounding bitwise shift left (predicated)
defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
}
// SVE2 predicated shifts
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>;
defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>;
defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>;
defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>;
defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
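// A short note on the B/T suffix convention used by the widening and
// narrowing groups in this block (standard SVE2 naming, summarized here for
// reference): the "B" (bottom) forms use the even-numbered elements of the
// narrow-element vector and the "T" (top) forms the odd-numbered ones, so a
// full vector of narrow elements is handled as a bottom/top pair without any
// explicit interleaving shuffles.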
// SVE2 integer add/subtract wide
defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>;
defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>;
defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>;
defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>;
defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;
// SVE2 bitwise shift and insert
defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
// SVE2 bitwise shift right and accumulate
defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;
// SVE2 integer absolute difference and accumulate
defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;
// SVE2 integer absolute difference and accumulate long
defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>;
defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>;
defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>;
// SVE2 integer add/subtract long with carry
defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>;
defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>;
defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>;
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>;
// SVE2 bitwise shift right narrow (bottom)
defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>;
defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>;
defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>;
defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>;
defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>;
defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>;
// SVE2 bitwise shift right narrow (top)
defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>;
defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>;
defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>;
defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>;
defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>;
defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>;
defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>;
defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>;
// SVE2 integer add/subtract narrow high part (bottom)
defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>;
defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>;
defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>;
defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>;
// SVE2 integer add/subtract narrow high part (top)
defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>;
defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>;
defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>;
defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>;
// SVE2 saturating extract narrow (bottom)
defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>;
defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>;
defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>;
// SVE2 saturating extract narrow (top)
defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
// SVE2 bitwise exclusive-or interleaved
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
// SVE2 bitwise shift left long
defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
// SVE2 histogram generation (segment)
def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
// SVE2 floating-point convert precision
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;
// SVE2 floating-point pairwise operations
defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>;
defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>;
defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>;
defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>;
defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>;
// SVE2 floating-point multiply-add long (indexed)
defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>;
defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>;
defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>;
defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>;
// SVE2 floating-point multiply-add long
defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>;
defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>;
defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>;
defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>;
defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>;
defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>;
defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>;
defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>;
defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>;
defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>;
defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
(nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
nxv8i16:$Op3))>;
}
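// A minimal sketch of the lowering used for the two-source table lookup
// above: TBL_ZZZZ consumes a ZZ register pair, so the int_aarch64_sve_tbl2
// patterns first pack both data operands into consecutive Z registers with
// REG_SEQUENCE, roughly
//   (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, $Op1, zsub0, $Op2, zsub1), $Op3)
// The bf16 patterns reuse the _H instructions, so only the element type
// differs from the nxv8i16 case handled by sve2_int_perm_tbl.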
// SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>;
defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
// SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
}
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
// SVE2 crypto unary operations
defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>;
// The PMULLB and PMULLT instructions that operate on 64-bit source and
// 128-bit destination elements are enabled with the crypto extensions,
// similar to the NEON PMULL2 instruction.
defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
}
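// A brief illustration of the note above (assembly shown for illustration
// only): with the SVE2 AES extension enabled, "pmullb z0.q, z1.d, z2.d"
// carrylessly multiplies the even-numbered 64-bit elements into 128-bit
// results, the scalable counterpart of the fixed-width 1Q forms of NEON
// PMULL/PMULL2; without it, only the narrower destination forms defined
// earlier under HasSVE2 are available.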
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
}
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
}
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a005d1e65abe..c56a65b9e212 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1,7831 +1,7835 @@
//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
//
//===----------------------------------------------------------------------===//
def SDT_AArch64Setcc : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>,
SDTCisVT<4, OtherVT>
]>;
def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>;
def SVEPatternOperand : AsmOperandClass {
let Name = "SVEPattern";
let ParserMethod = "tryParseSVEPattern";
let PredicateMethod = "isSVEPattern";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidSVEPattern";
}
def sve_pred_enum : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let PrintMethod = "printSVEPattern";
let ParserMatchClass = SVEPatternOperand;
}
def SVEPrefetchOperand : AsmOperandClass {
let Name = "SVEPrefetch";
let ParserMethod = "tryParsePrefetch<true>";
let PredicateMethod = "isPrefetch";
let RenderMethod = "addPrefetchOperands";
}
def sve_prfop : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) <= 15);
}]> {
let PrintMethod = "printPrefetchOp<true>";
let ParserMatchClass = SVEPrefetchOperand;
}
class SVELogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width;
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
}
def sve_logical_imm8 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<8>;
let PrintMethod = "printLogicalImm<int8_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
}];
}
def sve_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<16>;
let PrintMethod = "printLogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
}];
}
def sve_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<32>;
let PrintMethod = "printLogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
}];
}
class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVEPreferredLogicalImm" # Width;
let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
}
def sve_preferred_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
let PrintMethod = "printSVELogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
def sve_preferred_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
let PrintMethod = "printSVELogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
def sve_preferred_logical_imm64 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
let PrintMethod = "printSVELogicalImm<int64_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width # "Not";
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
}
def sve_logical_imm8_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<8>;
}
def sve_logical_imm16_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<16>;
}
def sve_logical_imm32_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<32>;
}
class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
: AsmOperandClass {
let Name = "SVE" # Infix # "Imm" # ElementWidth;
let DiagnosticType = "Invalid" # Name;
let RenderMethod = "addImmWithOptionalShiftOperands<8>";
let ParserMethod = "tryParseImmWithOptionalShift";
let PredicateMethod = Predicate;
}
def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;
def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
class imm8_opt_lsl<int ElementWidth, string printType,
AsmOperandClass OpndClass>
: Operand<i32> {
let EncoderMethod = "getImm8OptLsl";
let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
let PrintMethod = "printImm8OptLsl<" # printType # ">";
let ParserMatchClass = OpndClass;
let MIOperandInfo = (ops i32imm, i32imm);
}
def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>;
def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>;
def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>;
def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>;
def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>;
def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>;
def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
def SVEAddSubImm64Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
def SVELogicalImm8Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8>", []>;
def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>", []>;
def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>;
def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>;
def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>;
def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
let ParserMethod = "tryParseFPImm<false>";
let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
}
class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
}
def sve_fpimm_half_one
: SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
"AArch64ExactFPImm::one">;
def sve_fpimm_half_two
: SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
"AArch64ExactFPImm::two">;
def sve_fpimm_zero_one
: SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
"AArch64ExactFPImm::one">;
def sve_incdec_imm : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let ParserMatchClass = Imm1_16Operand;
let EncoderMethod = "getSVEIncDecImm";
let DecoderMethod = "DecodeSVEIncDecImm";
}
// This allows i32 immediate extraction from i64-based arithmetic.
def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">;
def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">;
//===----------------------------------------------------------------------===//
// SVE PTrue - These are used extensively throughout the pattern matching so
// it's important we define them first.
//===----------------------------------------------------------------------===//
class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
ValueType vt, SDPatternOperator op>
: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
asm, "\t$Pd, $pattern",
"",
[(set (vt pprty:$Pd), (op sve_pred_enum:$pattern))]>, Sched<[]> {
bits<4> Pd;
bits<5> pattern;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-17} = opc{2-1};
let Inst{16} = opc{0};
let Inst{15-10} = 0b111000;
let Inst{9-5} = pattern;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
}
multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_ptrue<0b00, opc, asm, PPR8, nxv16i1, op>;
def _H : sve_int_ptrue<0b01, opc, asm, PPR16, nxv8i1, op>;
def _S : sve_int_ptrue<0b10, opc, asm, PPR32, nxv4i1, op>;
def _D : sve_int_ptrue<0b11, opc, asm, PPR64, nxv2i1, op>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
}
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
let Predicates = [HasSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>;
}
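// A short note on how these definitions are used further down (assuming the
// usual SVE predicate-pattern encoding, where pattern 31, i.e. 0b11111,
// means "all elements"): several of the predicated selection patterns below
// key off (AArch64ptrue 31) as the governing predicate, so an unpredicated
// operation in the DAG can be matched onto a predicated instruction whose
// governing predicate is simply "ptrue pN.<T>, all".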
//===----------------------------------------------------------------------===//
// SVE pattern match helpers.
//===----------------------------------------------------------------------===//
class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
Instruction inst>
: Pat<(vtd (op vt1:$Op1)),
(inst $Op1)>;
class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))),
(inst $Op1, i32:$imm, i32:$shift)>;
class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, Operand ImmTy, Instruction inst>
: Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))),
(inst $Op1, ImmTy:$imm)>;
class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
(inst $Op1, i64:$imm)>;
class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst $Op1, $Op2)>;
class SVE_2_Op_Pat_Reduce_To_Neon<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst, SubRegIndex sub>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(INSERT_SUBREG (vtd (IMPLICIT_DEF)), (inst $Op1, $Op2), sub)>;
class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
(inst $Op1, $Op2, $Op3)>;
class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ValueType vt4,
Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, vt4:$Op4)),
(inst $Op1, $Op2, $Op3, $Op4)>;
class SVE_2_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Operand ImmTy, Instruction inst>
: Pat<(vtd (op vt1:$Op1, (vt2 ImmTy:$Op2))),
(inst $Op1, ImmTy:$Op2)>;
class SVE_3_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Operand ImmTy,
Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, (vt3 ImmTy:$Op3))),
(inst $Op1, $Op2, ImmTy:$Op3)>;
class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ValueType vt4,
Operand ImmTy, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
(inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>;
def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>;
let AddedComplexity = 1 in {
class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
(inst $Op1, $Op2, $Op3)>;
class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
ValueType vt1, ValueType vt2,
Operand vt3, Instruction inst>
: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))),
(inst $Op1, $Op2, vt3:$Op3)>;
}
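// A brief sketch of the idiom matched above: (vselect $Pg, $Op2, (SVEDup0))
// is the DAG shape produced when the inactive lanes of an operand must read
// as zero, so these patterns select the zeroing pseudos for that case (the
// zeroing shift multiclasses used by SQSHL_ZPZI and friends in the SVE2
// block are one consumer). AddedComplexity = 1 simply makes them win over
// the plain SVE_3_Op_Pat forms when both would otherwise match.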
//
// Common but less generic patterns.
//
class SVE_1_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
Instruction inst, Instruction ptrue>
: Pat<(vtd (op vt1:$Op1)),
(inst (IMPLICIT_DEF), (ptrue 31), $Op1)>;
class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst, Instruction ptrue>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst (ptrue 31), $Op1, $Op2)>;
//
// Pseudo -> Instruction mappings
//
def getSVEPseudoMap : InstrMapping {
let FilterClass = "SVEPseudo2Instr";
let RowFields = ["PseudoName"];
let ColFields = ["IsInstr"];
let KeyCol = ["0"];
let ValueCols = [["1"]];
}
class SVEPseudo2Instr<string name, bit instr> {
string PseudoName = name;
bit IsInstr = instr;
}
// Lookup e.g. DIV -> DIVR
def getSVERevInstr : InstrMapping {
let FilterClass = "SVEInstr2Rev";
let RowFields = ["InstrName"];
let ColFields = ["isReverseInstr"];
let KeyCol = ["0"];
let ValueCols = [["1"]];
}
// Lookup e.g. DIVR -> DIV
def getSVENonRevInstr : InstrMapping {
let FilterClass = "SVEInstr2Rev";
let RowFields = ["InstrName"];
let ColFields = ["isReverseInstr"];
let KeyCol = ["1"];
let ValueCols = [["0"]];
}
class SVEInstr2Rev<string name1, string name2, bit name1IsReverseInstr> {
string InstrName = !if(name1IsReverseInstr, name1, name2);
bit isReverseInstr = name1IsReverseInstr;
}
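// A minimal sketch of how these mappings are meant to be consumed (names
// below are illustrative): a real instruction and its destructive-operand
// pseudo carry the same PseudoName string via SVEPseudo2Instr, with IsInstr
// set to 1 on the instruction and 0 on the pseudo, so the table TableGen
// emits for getSVEPseudoMap resolves a pseudo opcode back to its concrete
// encoding at expansion time. getSVERevInstr and getSVENonRevInstr work the
// same way over SVEInstr2Rev, pairing a reversed-operand form with its base
// form (the DIV <-> DIVR example above) in either direction.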
//
// Pseudos for destructive operands
//
let hasNoSchedulingInfo = 1 in {
class PredTwoOpPseudo<string name, ZPRRegOp zprty,
FalseLanesEnum flags = FalseLanesNone>
: SVEPseudo2Instr<name, 0>,
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> {
let FalseLanes = flags;
}
class PredTwoOpImmPseudo<string name, ZPRRegOp zprty, Operand immty,
FalseLanesEnum flags = FalseLanesNone>
: SVEPseudo2Instr<name, 0>,
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
let FalseLanes = flags;
}
}
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
//===----------------------------------------------------------------------===//
class sve_int_pfalse<bits<6> opc, string asm>
: I<(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
"",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{3-1};
let Inst{15-10} = 0b111001;
let Inst{9} = opc{0};
let Inst{8-4} = 0b00000;
let Inst{3-0} = Pd;
}
class sve_int_ptest<bits<6> opc, string asm>
: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
asm, "\t$Pg, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b010;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let Defs = [NZCV];
}
class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pdn), (ins PPRAny:$Pg, pprty:$_Pdn),
asm, "\t$Pdn, $Pg, $_Pdn",
"",
[]>, Sched<[]> {
bits<4> Pdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b11000;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pdn;
let Constraints = "$Pdn = $_Pdn";
let Defs = [NZCV];
}
multiclass sve_int_pfirst<bits<5> opc, string asm, SDPatternOperator op> {
def _B : sve_int_pfirst_next<0b01, opc, asm, PPR8>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
}
multiclass sve_int_pnext<bits<5> opc, string asm, SDPatternOperator op> {
def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>;
def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>;
def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>;
def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Predicate Count Group
//===----------------------------------------------------------------------===//
class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
asm, "\t$Rdn, $Pg",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10001;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
!strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
!strconcat(asm, "\t$Rdn, $Pg"));
let Constraints = "$Rdn = $_Rdn";
}
multiclass sve_int_count_r_s32<bits<5> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
}
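// A short note on the INSERT_SUBREG/EXTRACT_SUBREG dance in the patterns
// above (the same reasoning as the register-allocation note further down in
// sve_int_pred_pattern_b_s32): the instruction's tied operand is a 64-bit
// GPR while the value being counted is i32, and tied operands of differing
// register classes are not accepted, so the i32 input is widened into a
// GPR64 first and the plain i32 result extracted back out. The sext
// patterns skip the extract because the signed forms already produce the
// sign-extended 64-bit value.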
multiclass sve_int_count_r_u32<bits<5> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
}
multiclass sve_int_count_r_x64<bits<5> opc, string asm,
SDPatternOperator op = null_frag> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
def : Pat<(i64 (op GPR64:$Rn, (nxv16i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv8i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv4i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
}
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
ZPRRegOp zprty, PPRRegOp pprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
asm, "\t$Zdn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pm;
bits<5> Zdn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10000;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_count_v<bits<5> opc, string asm,
SDPatternOperator op = null_frag> {
def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, !cast<Instruction>(NAME # _D)>;
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
}
class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
PPRRegOp pprty>
: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
asm, "\t$Rd, $Pg, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
bits<5> Rd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b10;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = Rd;
}
multiclass sve_int_pcount_pred<bits<4> opc, string asm,
SDPatternOperator int_op> {
def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
def : SVE_2_Op_Pat<i64, int_op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, int_op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, int_op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, int_op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Element Count Group
//===----------------------------------------------------------------------===//
class sve_int_count<bits<3> opc, string asm>
: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rd, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<4> imm4;
bits<5> pattern;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b10;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rd;
}
multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_count<opc, asm>;
def : InstAlias<asm # "\t$Rd, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd",
(!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm))),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>;
def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm)))),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>;
def : Pat<(i64 (op sve_pred_enum:$pattern)),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, 1)>;
}
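// A brief sketch of what the extra patterns above fold (values illustrative):
// sve_cnt_mul_imm and sve_cnt_shl_imm only accept multipliers the encoding
// can express (1 to 16), so DAGs shaped like
//   (mul (cntb all), 4)   or   (shl (cntb all), 1)
// are selected as a single count instruction with "mul #4" or "mul #2"
// respectively, rather than a separate multiply or shift.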
class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Zdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1100;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty,
SDPatternOperator op = null_frag,
ValueType vt = OtherVT> {
def NAME : sve_int_countvlv<opc, asm, zprty>;
def : InstAlias<asm # "\t$Zdn, $pattern",
(!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Zdn",
(!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
def : Pat<(vt (op (vt zprty:$Zn), (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Zn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
}
class sve_int_pred_pattern_a<bits<3> opc, string asm>
: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b11;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
}
multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
def NAME : sve_int_pred_pattern_a<opc, asm>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
}
class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
RegisterOperand st>
: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1111;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
let AsmString = !if(!eq(opc{2,0}, 0b00),
!strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
!strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));
let Constraints = "$Rdn = $_Rdn";
}
multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;
def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd, $Rn",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
// NOTE: Register allocation doesn't like tied operands of differing register
// class, hence the extra INSERT_SUBREG complication.
def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))))),
(!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
}
multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
}
multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
def : Pat<(i64 (op GPR64:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Cross Lane Group
//===----------------------------------------------------------------------===//
class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
ValueType vt, RegisterClass srcRegType,
SDPatternOperator op>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
asm, "\t$Zd, $Rn",
"",
[(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100000001110;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>;
def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>;
def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>;
def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
}
class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
asm, "\t$Zd, $Zn$idx",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<7> idx;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = {?,?}; // imm3h
let Inst{21} = 0b1;
let Inst{20-16} = tsz;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_dup_i<string asm> {
def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
let Inst{23-22} = idx{5-4};
let Inst{20-17} = idx{3-0};
}
def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
let Inst{23-22} = idx{4-3};
let Inst{20-18} = idx{2-0};
}
def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
let Inst{23-22} = idx{3-2};
let Inst{20-19} = idx{1-0};
}
def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
let Inst{23-22} = idx{2-1};
let Inst{20} = idx{0};
}
def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
let Inst{23-22} = idx{1-0};
}
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
def : InstAlias<"mov $Zd, $Bn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
def : InstAlias<"mov $Zd, $Hn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
def : InstAlias<"mov $Zd, $Sn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
def : InstAlias<"mov $Zd, $Dn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
def : InstAlias<"mov $Zd, $Qn",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
}
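// A short note on the index encoding implied by the tsz overrides above: the
// position of the lowest set bit of tsz selects the element size (xxxx1 for
// bytes up to 10000 for quadwords), and the bits above it, together with the
// field at Inst{23-22} (commented as imm3h), hold the element index, which
// is why each size variant patches a different slice of idx into tsz.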
class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b001;
let Inst{12-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
(nxv16i8 (!cast<Instruction>(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
nxv16i8:$Op2, zsub1),
nxv16i8:$Op3))>;
def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)),
(nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
nxv8i16:$Op2, zsub1),
nxv8i16:$Op3))>;
def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)),
(nxv4i32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0,
nxv4i32:$Op2, zsub1),
nxv4i32:$Op3))>;
def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)),
(nxv2i64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0,
nxv2i64:$Op2, zsub1),
nxv2i64:$Op3))>;
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)),
(nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
nxv8f16:$Op2, zsub1),
nxv8i16:$Op3))>;
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)),
(nxv4f32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0,
nxv4f32:$Op2, zsub1),
nxv4i32:$Op3))>;
def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)),
(nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0,
nxv2f64:$Op2, zsub1),
nxv2i64:$Op3))>;
}
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001011;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b111000001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_reverse_z<string asm, SDPatternOperator op> {
def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i64, op, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn),
asm, "\t$Pd, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-9} = 0b1101000100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
multiclass sve_int_perm_reverse_p<string asm, SDPatternOperator op> {
def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
def : SVE_1_Op_Pat<nxv16i1, op, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i1, op, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i1, op, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i1, op, nxv2i1, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz16_64;
let Inst{21-18} = 0b1100;
let Inst{17-16} = opc;
let Inst{15-10} = 0b001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
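// Unpacks widen the low or high half of the source vector, so only H/S/D
// destination forms exist. Illustrative use (assumed; opcode shown is only
// for illustration):
//   defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", int_aarch64_sve_sunpklo>;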
class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
asm, "\t$Zdn, $Rm",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100100001110;
let Inst{9-5} = Rm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> {
def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
asm, "\t$Zdn, $Vm",
"",
[]>, Sched<[]> {
bits<5> Vm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b110100001110;
let Inst{9-5} = Vm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Extract Group
//===----------------------------------------------------------------------===//
class sve_int_perm_extract_i<string asm>
: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<8> imm8;
let Inst{31-21} = 0b00000101001;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op> {
def NAME : sve_int_perm_extract_i<asm>;
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, imm0_255,
!cast<Instruction>(NAME)>;
}
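// EXT is destructive: $Zdn is both the first source and the destination, and
// the byte offset comes from the 8-bit immediate. Illustrative use (assumed):
//   defm EXT_ZZI : sve_int_perm_extract_i<"ext", int_aarch64_sve_ext>;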
class sve2_int_perm_extract_i_cons<string asm>
: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
asm, "\t$Zd, $Zn, $imm8",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<8> imm8;
let Inst{31-21} = 0b00000101011;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
//===----------------------------------------------------------------------===//
// SVE Vector Select Group
//===----------------------------------------------------------------------===//
class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_sel_vvv<string asm, SDPatternOperator op> {
def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
}
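// The InstAliases above print SEL in the preferred "mov $Zd, $Pg/m, $Zn" form
// when the false operand is the destination register itself.
// Illustrative use (assumed): defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;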
//===----------------------------------------------------------------------===//
// SVE Predicate Logical Operations Group
//===----------------------------------------------------------------------===//
class sve_int_pred_log<bits<4> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b01;
let Inst{13-10} = Pg;
let Inst{9} = opc{1};
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// SEL has no predication qualifier.
let AsmString = !if(!eq(opc, 0b0011),
!strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
!strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));
let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
}
multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
SDPatternOperator op_nopred = null_frag> {
def NAME : sve_int_pred_log<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>;
def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1,
!cast<Instruction>(NAME), PTRUE_B>;
def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1,
!cast<Instruction>(NAME), PTRUE_H>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4i1, nxv4i1,
!cast<Instruction>(NAME), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1,
!cast<Instruction>(NAME), PTRUE_D>;
}
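// opc{2} selects the flag-setting form (Defs = [NZCV]); op_nopred lets an
// unpredicated DAG node reuse the instruction with an all-true governing
// predicate. Illustrative use (assumed; opcode and operators are only for
// illustration):
//   defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;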
//===----------------------------------------------------------------------===//
// SVE Logical Mask Immediate Group
//===----------------------------------------------------------------------===//
class sve_int_log_imm<bits<2> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
asm, "\t$Zdn, $_Zdn, $imms13",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<13> imms13;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = opc;
let Inst{21-18} = 0b0000;
let Inst{17-5} = imms13;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DecoderMethod = "DecodeSVELogicalImmInstruction";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_log_imm<bits<2> opc, string asm, string alias, SDPatternOperator op> {
def NAME : sve_int_log_imm<opc, asm>;
def : SVE_1_Op_Imm_Log_Pat<nxv16i8, op, ZPR8, i32, SVELogicalImm8Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv8i16, op, ZPR16, i32, SVELogicalImm16Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv4i32, op, ZPR32, i32, SVELogicalImm32Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv2i64, op, ZPR64, i64, SVELogicalImm64Pat, !cast<Instruction>(NAME)>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
}
class sve_int_dup_mask_imm<string asm>
: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
asm, "\t$Zd, $imms",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<13> imms;
let Inst{31-18} = 0b00000101110000;
let Inst{17-5} = imms;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
let DecoderMethod = "DecodeSVELogicalImmInstruction";
}
multiclass sve_int_dup_mask_imm<string asm> {
def NAME : sve_int_dup_mask_imm<asm>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;
// All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
}
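// DUPM consumes the 13-bit SVE logical-immediate encoding; the "mov" aliases
// above cover only the immediates the assembler prefers to print that way.
// Illustrative use (assumed): defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;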
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm,
SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Intrinsic version
def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Arithmetic - Predicated Group
//===----------------------------------------------------------------------===//
class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bit i1;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-6} = 0b0000;
let Inst{5} = i1;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
}
class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
SDPatternOperator op, DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
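// The Ps/revname/isReverseInstr hooks associate each instruction with its
// predicated pseudo and its reversed-operand twin (e.g. fsub vs. fsubr) so
// later expansion can pick the movprfx-friendly form.
// Illustrative use (assumed; flag value is only for illustration):
//   defm FADD_ZPmZZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPmZZ",
//                                      int_aarch64_sve_fadd, DestructiveOther>;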
multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
SDPatternOperator op> {
def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<3> imm3;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b010;
let Inst{18-16} = imm3;
let Inst{15-10} = 0b100000;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_fp_ftmad<string asm, SDPatternOperator op> {
def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
def : Pat<(nxv8f16 (op (nxv8f16 ZPR16:$Zn), (nxv8f16 ZPR16:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _H) ZPR16:$Zn, ZPR16:$Zm, imm32_0_7:$imm)>;
def : Pat<(nxv4f32 (op (nxv4f32 ZPR32:$Zn), (nxv4f32 ZPR32:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _S) ZPR32:$Zn, ZPR32:$Zm, imm32_0_7:$imm)>;
def : Pat<(nxv2f64 (op (nxv2f64 ZPR64:$Zn), (nxv2f64 ZPR64:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, imm32_0_7:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Arithmetic - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Fused Multiply-Add Group
//===----------------------------------------------------------------------===//
class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zda;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
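// The "_a" form accumulates into $Zda (e.g. fmla/fmls); the "_b" form below
// overwrites the multiplicand register instead (e.g. fmad/fmsb).
// Illustrative use (assumed):
//   defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;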
class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
asm, "\t$Zdn, $Pg/m, $Zm, $Za",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Za;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Za;
let Inst{15} = 0b1;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-11} = 0;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply - Indexed Group
//===----------------------------------------------------------------------===//
class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>;
def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
complexrotateop:$imm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<3> Pg;
bits<5> Zn;
bits<5> Zm;
bits<2> imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15} = 0;
let Inst{14-13} = imm;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_fcmla<string asm, SDPatternOperator op> {
def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
def _D : sve_fp_fcmla<0b11, asm, ZPR64>;
def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, nxv8f16:$Op4, (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, nxv4f32:$Op4, (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, nxv2f64:$Op4, (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
ZPRRegOp zprty,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
complexrotateop:$imm),
asm, "\t$Zda, $Zn, $Zm$iop, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<2> imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-12} = 0b0001;
let Inst{11-10} = imm;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_fp_fcmla_by_indexed_elem<string asm, SDPatternOperator op> {
def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD32b> {
bits<4> Zm;
bits<1> iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Addition Group
//===----------------------------------------------------------------------===//
class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
complexrotateopodd:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<3> Pg;
bit imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21-17} = 0;
let Inst{16} = imm;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_fcadd<string asm, SDPatternOperator op> {
def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 complexrotateopodd:$imm))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 complexrotateopodd:$imm))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 complexrotateopodd:$imm))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
}
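// FCADD accepts only 90- or 270-degree rotations, hence the single
// complexrotateopodd bit in the encoding. Illustrative use (assumed):
//   defm FCADD_ZPmZZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;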
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Convert Group
//===----------------------------------------------------------------------===//
class sve2_fp_convert_precision<bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<3> Pg;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = opc{3-2};
let Inst{21-18} = 0b0010;
let Inst{17-16} = opc{1-0};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_fp_convert_down_narrow<string asm, string op> {
def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
multiclass sve2_fp_convert_up_long<string asm, string op> {
def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
}
multiclass sve2_fp_convert_down_odd_rounding_top<string asm, string op> {
def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Pairwise Group
//===----------------------------------------------------------------------===//
class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zm;
bits<5> Zdn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21-19} = 0b010;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Widening Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
VectorIndexH32b:$iop),
asm, "\t$Zda, $Zn, $Zm$iop",
"",
[]>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<3> Zm;
bits<3> iop;
let Inst{31-21} = 0b01100100101;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{15-14} = 0b01;
let Inst{13} = opc{1};
let Inst{12} = 0b0;
let Inst{11} = iop{0};
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm,
SDPatternOperator op> {
def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>;
def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Widening Multiply-Add Group
//===----------------------------------------------------------------------===//
class sve2_fp_mla_long<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
asm, "\t$Zda, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01100100101;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b10;
let Inst{13} = opc{1};
let Inst{12-11} = 0b00;
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_fp_mla_long<bits<2> opc, string asm, SDPatternOperator op> {
def NAME : sve2_fp_mla_long<opc, asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Stack Allocation Group
//===----------------------------------------------------------------------===//
class sve_int_arith_vl<bit opc, string asm>
: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
asm, "\t$Rd, $Rn, $imm6",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<5> Rn;
bits<6> imm6;
let Inst{31-23} = 0b000001000;
let Inst{22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Rn;
let Inst{15-11} = 0b01010;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
}
class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
asm, "\t$Rd, $imm6",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<6> imm6;
let Inst{31-23} = 0b000001001;
let Inst{22} = op;
let Inst{21} = 0b1;
let Inst{20-16} = opc2{4-0};
let Inst{15-11} = 0b01010;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
}
//===----------------------------------------------------------------------===//
// SVE Permute - In Lane Group
//===----------------------------------------------------------------------===//
class sve_int_perm_bin_perm_zz<bits<3> opc, bits<2> sz8_64, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_perm_bin_perm_zz<opc, 0b00, asm, ZPR8>;
def _H : sve_int_perm_bin_perm_zz<opc, 0b01, asm, ZPR16>;
def _S : sve_int_perm_bin_perm_zz<opc, 0b10, asm, ZPR32>;
def _D : sve_int_perm_bin_perm_zz<opc, 0b11, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f16, op, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Unary Operations Group
//===----------------------------------------------------------------------===//
class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
RegisterOperand o_zprtype, ElementSizeEnum size>
: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = opc{6-5};
let Inst{21} = 0b0;
let Inst{20-16} = opc{4-0};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = size;
}
multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
RegisterOperand i_zprtype,
RegisterOperand o_zprtype,
SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;
def : SVE_3_Op_Pat<vt1, op, vt1, vt2, vt3, !cast<Instruction>(NAME)>;
}
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>;
def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;
def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Unary Operations - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b001;
let Inst{18-16} = opc;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_fp_2op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Binary Predicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-19} = fmt;
let Inst{18-16} = opc;
let Inst{15-13} = 0b000;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps,
SDPatternOperator op,
DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>,
SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Special case for divides, which are not defined for 8b/16b elements.
multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, string Ps,
SDPatternOperator op,
DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
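// Illustrative use (assumed, S/D element sizes only; opcode and flag value
// are only for illustration):
//   defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPmZ",
//                                                int_aarch64_sve_sdiv, DestructiveOther>;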
//===----------------------------------------------------------------------===//
// SVE Integer Multiply-Add Group
//===----------------------------------------------------------------------===//
class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
asm, "\t$Zdn, $Pg/m, $Zm, $Za",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Za;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Za;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;
def : SVE_4_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zda;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b01;
let Inst{13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;
def : SVE_4_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_mla<bit S, string asm, SDPatternOperator op> {
def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_mla_long<bits<5> opc, string asm, SDPatternOperator op> {
def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm,
SDPatternOperator op> {
def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add Long - Indexed Group
//===----------------------------------------------------------------------===//
multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Group
//===----------------------------------------------------------------------===//
class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-23} = 0b010001001;
let Inst{22} = sz;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15-11} = 0;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>;
}
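// Dot products accumulate four narrow products per destination element, so
// the S form takes byte sources and the D form halfword sources.
// Illustrative use (assumed): defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;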
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product - Indexed Group
//===----------------------------------------------------------------------===//
class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-23} = 0b010001001;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{15-11} = 0;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Integer Dot Product Group
//===----------------------------------------------------------------------===//
class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm,
complexrotateop:$rot),
asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
bits<2> rot;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-12} = opc;
let Inst{11-10} = rot;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_cintx_dot<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
(i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>;
def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>;
}
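// Complex integer dot product (CDOT-style): the complexrotateop operand $rot
// encodes a rotation of #0, #90, #180 or #270 that is applied when forming
// the complex products before they are accumulated into Zda.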
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
multiclass sve2_int_cmla<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
def : SVE_4_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, i32, complexrotateop, !cast<Instruction>(NAME # _B)>;
def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, complexrotateop, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, complexrotateop, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, complexrotateop, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Integer Dot Product - Indexed Group
//===----------------------------------------------------------------------===//
class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop,
complexrotateop:$rot),
asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<2> rot;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-12} = opc;
let Inst{11-10} = rot;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_cintx_dot_by_indexed_elem<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
(i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
multiclass sve2_cmla_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3),
(i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
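// Note: the indexed CMLA variants encode sz one step larger than the element
// size (0b10 selects the .H form and 0b11 the .S form), matching the class
// instantiations above; the element index shares bits 20-16 with the
// restricted Zm register.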
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
}
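// The _single variant covers operations that only exist at byte granularity
// (PMUL, for example), so only the _B form and its pattern are defined.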
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply - Indexed Group
//===----------------------------------------------------------------------===//
class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
SDPatternOperator op> {
def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer - Predicated Group
//===----------------------------------------------------------------------===//
class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zm;
bits<5> Zdn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-20} = 0b01;
let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10;
let Inst{13} = opc{0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn),
asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zda;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00010;
let Inst{16} = U;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty1.ElementSize;
}
multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19} = Q;
let Inst{18} = 0b0;
let Inst{17-16} = opc;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
}
multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Widening Integer Arithmetic Group
//===----------------------------------------------------------------------===//
class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_wide_int_arith_pmul<bits<2> sz, bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve2_wide_int_arith<sz, opc, asm, ZPR128, ZPR64, ZPR64>;
  // To avoid using 128-bit elements in the IR, the pattern below works with
  // LLVM intrinsics carrying the _pair suffix, reflecting that
  // _Q is implemented as a pair of _D.
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
multiclass sve2_pmul_long<bits<1> opc, string asm, SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
  // To avoid using 128-bit elements in the IR, the patterns below work with
  // LLVM intrinsics carrying the _pair suffix, reflecting that
  // _H is implemented as a pair of _B and _D as a pair of _S.
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Misc Group
//===----------------------------------------------------------------------===//
class sve2_misc<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b10;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_misc_bitwise<bits<4> opc, string asm, SDPatternOperator op> {
def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b10010;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_bitwise_xor_interleaved<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1010;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
ZPR32, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm,
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
}
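// As in the shift-immediate formats that follow, the tsz8_64 bits and the
// immediate jointly encode the element size and shift amount, so each wider
// element variant claims extra imm bits in its per-size lets above.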
//===----------------------------------------------------------------------===//
// SVE2 Accumulate Group
//===----------------------------------------------------------------------===//
class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_bin_shift_imm_left<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_bin_shift_imm_right<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
asm, "\t$Zda, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1110;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot),
asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bit rot;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00000;
let Inst{16} = opc;
let Inst{15-11} = 0b11011;
let Inst{10} = rot;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, complexrotateopodd, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, complexrotateopodd, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, complexrotateopodd, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Narrowing Group
//===----------------------------------------------------------------------===//
class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
let Inst{13-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
tvecshiftR8>;
def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
tvecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def : SVE_2_Op_Imm_Pat<nxv16i8, op, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
}
class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
let Inst{13-11} = opc;
let Inst{10} = 0b1;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
tvecshiftR8>;
def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
tvecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
}
class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-11} = opc; // S, R
let Inst{10} = 0b0; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-11} = opc; // S, R
let Inst{10} = 0b1; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
let Inst{12-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
let Inst{12-11} = opc;
let Inst{10} = 0b1;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unary Predicated Group
//===----------------------------------------------------------------------===//
class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = opc{0};
let Inst{18-16} = opc{3-1};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm,
SDPatternOperator op> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Wide Immediate - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_int_dup_imm<bits<2> sz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins immtype:$imm),
asm, "\t$Zd, $imm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100011;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let isReMaterializable = 1;
}
multiclass sve_int_dup_imm<string asm> {
def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
}
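// The "fmov $Zd, #0.0" aliases above reuse the integer dup with imm8=0 and
// shift=0, since +0.0 has the same bit pattern as integer zero.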
class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
asm, "\t$Zd, $imm8",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100111;
let Inst{13} = 0b0;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
}
multiclass sve_int_dup_fpimm<string asm> {
def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
}
class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc;
let Inst{15-14} = 0b11;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_arith_imm0<bits<3> opc, string asm,
SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
// Intrinsic version
def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<8> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-16} = opc;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, simm8>;
def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, simm8>;
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, imm0_255>;
def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, imm0_255>;
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Bitwise Logical - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_cons_log<bits<2> opc, string asm>
: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{1-0};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_log<bits<2> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_bin_cons_log<opc, asm>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>;
}
class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk),
asm, "\t$Zdn, $_Zdn, $Zm, $Zk",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zk;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b00111;
let Inst{10} = opc{0};
let Inst{9-5} = Zk;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $Zm, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-10} = 0b001101;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> {
def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Wide Immediate - Predicated Group
//===----------------------------------------------------------------------===//
class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
asm, "\t$Zd, $Pg/m, $imm8",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_dup_fpimm_pred<string asm> {
def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
}
class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
ZPRRegOp zprty, string pred_qual, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pred_qual#", $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<4> Pg;
bits<9> imm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15} = 0b0;
let Inst{14} = m;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_dup_imm_pred_merge_inst<
bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
let Constraints = "$Zd = $_Zd" in
def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
(ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
def : Pat<(intty
(vselect predty:$Pg,
(intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
intty:$Zd)),
(!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
}
multiclass sve_int_dup_imm_pred_merge<string asm> {
defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
i32, cpy_imm8_opt_lsl_i8>;
defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
i32, cpy_imm8_opt_lsl_i16>;
defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
i32, cpy_imm8_opt_lsl_i32>;
defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
i64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
}
multiclass sve_int_dup_imm_pred_zero_inst<
bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
(ins PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
def : Pat<(intty (zext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
def : Pat<(intty (sext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
def : Pat<(intty
(vselect predty:$Pg,
(intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
(intty (AArch64dup (scalarty 0))))),
(!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
}
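// The extension patterns above lower a predicate extend to a zeroing dup
// under that predicate: zext/anyext use an immediate of 1 and sext uses -1
// (all ones in each active element).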
multiclass sve_int_dup_imm_pred_zero<string asm> {
defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
i32, cpy_imm8_opt_lsl_i8>;
defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
i32, cpy_imm8_opt_lsl_i16>;
defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
i32, cpy_imm8_opt_lsl_i32>;
defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
i64, cpy_imm8_opt_lsl_i64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Vectors Group
//===----------------------------------------------------------------------===//
class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = cmp_1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
ValueType intvt, sve_int_cmp cmp> {
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)),
(cmp $Op1, $Op2, $Op3)>;
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
(cmp $Op1, $Op3, $Op2)>;
}
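// The second pattern matches the companion condition code with the compare
// operands swapped, so a single compare instruction covers both operand
// orders of the setcc.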
multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
defm : SVE_SETCC_Pat<cc, invcc, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_cmp_0_wide<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
multiclass sve_int_cmp_1_wide<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Signed Immediate Group
//===----------------------------------------------------------------------===//
class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty,
Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
asm, "\t$Pd, $Pg/z, $Zn, $imm5",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = imm5;
let Inst{15} = opc{2};
let Inst{14} = 0b0;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
let ElementSize = pprty.ElementSize;
}
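// Match compares against a broadcast immediate on either side; when the
// immediate is the left-hand operand, the commuted condition code is used.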
multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
ValueType predvt, ValueType intvt,
Operand immtype, Instruction cmp> {
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
(intvt ZPR:$Zs1),
(intvt (AArch64dup (immtype:$imm))),
cc)),
(cmp $Pg, $Zs1, immtype:$imm)>;
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
(intvt (AArch64dup (immtype:$imm))),
(intvt ZPR:$Zs1),
commuted_cc)),
(cmp $Pg, $Zs1, immtype:$imm)>;
}
multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, CondCode commuted_cc> {
def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, simm5_32b,
!cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, simm5_32b,
!cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, simm5_32b,
!cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, simm5_64b,
!cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Unsigned Immediate Group
//===----------------------------------------------------------------------===//
class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty, Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
asm, "\t$Pd, $Pg/z, $Zn, $imm7",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<7> imm7;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 1;
let Inst{20-14} = imm7;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc,
CondCode commuted_cc> {
def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, imm0_127,
!cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, imm0_127,
!cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, imm0_127,
!cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, imm0_127_64b,
!cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Scalars Group
//===----------------------------------------------------------------------===//
class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt>
: I<(outs), (ins rt:$Rn, rt:$Rm),
asm, "\t$Rn, $Rm",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
let Inst{31-23} = 0b001001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Rn;
let Inst{4} = opc;
let Inst{3-0} = 0b0000;
let Defs = [NZCV];
}
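// WHILE-style predicate generation from a pair of scalar counters; the *4
// variants take 32-bit GPRs and the *8 variants take 64-bit GPRs.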
class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
RegisterClass gprty, PPRRegOp pprty,
ValueType vt, SDPatternOperator op>
: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc{3-1};
let Inst{9-5} = Rn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
multiclass sve_int_while4_rr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8, nxv16i1, op>;
def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16, nxv8i1, op>;
def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32, nxv4i1, op>;
def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64, nxv2i1, op>;
def : SVE_2_Op_Pat<nxv16i1, op, i32, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_while8_rr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8, nxv16i1, op>;
def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16, nxv8i1, op>;
def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32, nxv4i1, op>;
def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64, nxv2i1, op>;
def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Rn;
let Inst{4} = rw;
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Fast Reduction Group
//===----------------------------------------------------------------------===//
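// Unordered (fast) horizontal reduction of a predicated vector into a scalar
// FP register, exposed through an FPR-as-ZPR destination operand.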
class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, FPRasZPROperand dstOpType>
: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zn;
bits<5> Vd;
bits<3> Pg;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b000;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>;
def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Accumulating Reduction Group
//===----------------------------------------------------------------------===//
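// Accumulating reduction: the scalar accumulator is both source and
// destination ($Vdn is tied to $_Vdn), making the instruction destructive.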
class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, FPRasZPROperand dstOpType>
: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>,
Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
}
multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>;
def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - Vectors Group
//===----------------------------------------------------------------------===//
class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = 0b1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
}
multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
SDPatternOperator op_nopred>
: sve_fp_3op_p_pd<opc, asm, op> {
def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16,
!cast<Instruction>(NAME # _H), PTRUE_H>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16,
!cast<Instruction>(NAME # _H), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16,
!cast<Instruction>(NAME # _H), PTRUE_D>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32,
!cast<Instruction>(NAME # _S), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32,
!cast<Instruction>(NAME # _S), PTRUE_D>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64,
!cast<Instruction>(NAME # _D), PTRUE_D>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - with Zero Group
//===----------------------------------------------------------------------===//
class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Pd, $Pg/z, $Zn, #0.0",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-18} = 0b0100;
let Inst{17-16} = opc{2-1};
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
}
multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Index Generation Group
//===----------------------------------------------------------------------===//
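// INDEX-style sequence generation: each lane gets start + i * step, with the
// start and step each drawn from an immediate or a scalar register, giving
// the ii/ir/ri/rr variants below.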
class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
asm, "\t$Zd, $imm5, $imm5b",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> imm5;
bits<5> imm5b;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5b;
let Inst{15-10} = 0b010000;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
}
multiclass sve_int_index_ii<string asm, SDPatternOperator op> {
def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
}
class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
asm, "\t$Zd, $imm5, $Rm",
"", []>, Sched<[]> {
bits<5> Rm;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010010;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
}
multiclass sve_int_index_ir<string asm, SDPatternOperator op> {
def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
}
class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
asm, "\t$Zd, $Rn, $imm5",
"", []>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5;
let Inst{15-10} = 0b010001;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
multiclass sve_int_index_ri<string asm, SDPatternOperator op> {
def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)),
(!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)),
(!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)),
(!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)),
(!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
}
class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
asm, "\t$Zd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010011;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
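// Predicated shifts by an element-size-dependent immediate: the tsz bits
// select the element size and the remaining immediate bits are spliced into
// the encoding per size, which is why the _H/_S/_D variants override Inst
// fields inherited from sve_int_bin_pred_shift_imm.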
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-8} = tsz8_64{1-0};
let Inst{7-5} = imm{2-0}; // imm3
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveBinaryImm;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> {
def _B : SVEPseudo2Instr<psName # _B, 1>,
sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : SVEPseudo2Instr<psName # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
}
def _S : SVEPseudo2Instr<psName # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
}
def _D : SVEPseudo2Instr<psName # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
}
multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
string psName,
SDPatternOperator op> {
def _B : SVEPseudo2Instr<psName # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : SVEPseudo2Instr<psName # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
}
def _S : SVEPseudo2Instr<psName # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
}
def _D : SVEPseudo2Instr<psName # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>;
def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, tvecshiftL32, FalseLanesZero>;
def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, tvecshiftL64, FalseLanesZero>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftL8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftL16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftL32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftL64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
SDPatternOperator op = null_frag> {
def _B : SVEPseudo2Instr<Ps # _B, 1>,
sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : SVEPseudo2Instr<Ps # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{8} = imm{3};
}
def _S : SVEPseudo2Instr<Ps # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{9-8} = imm{4-3};
}
def _D : SVEPseudo2Instr<Ps # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesZero>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftR8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftR16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftR32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftR64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = wide;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, string Ps,
SDPatternOperator op, string revname, bit isReverseInstr = 0> {
let DestructiveInstType = DestructiveBinaryCommWithRev in {
def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>,
SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>,
SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
}
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
//===----------------------------------------------------------------------===//
// SVE Shift - Unpredicated Group
//===----------------------------------------------------------------------===//
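// Unpredicated shifts: the wide form takes its shift amounts from the .d
// elements of the second operand, while the immediate form reuses the tsz
// immediate-splitting scheme of the predicated immediate shifts above.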
class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1000;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
}
class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1001;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Store Group
//===----------------------------------------------------------------------===//
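// Contiguous and multi-register (structured) stores: scalar base plus either
// an immediate scaled by the vector length ("mul vl") or a scalar index;
// extra InstAliases accept the zero-offset and single-register (non-list)
// syntaxes.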
class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = esz;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand listty, ZPRRegOp zprty>
{
def NAME : sve_mem_cst_si<msz, esz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 1;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
class sve_mem_cst_ss_base<bits<4> dtype, string asm,
RegisterOperand listty, RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b010;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
RegisterOperand listty, ZPRRegOp zprty,
RegisterOperand gprty> {
def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b001;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cstnt_si<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, "\t$Zt, $Pg, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = 0b0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
}
multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
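// Scatter stores with a scalar base plus a vector of offsets; the 32-bit
// offset forms sign-extend (SXTW) or zero-extend (UXTW) each offset and may
// scale it by the element size.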
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
RegisterOperand VecList, RegisterOperand zprext>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b1;
let Inst{14} = xs;
let Inst{13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_32b_sst_sv_32_scaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt > {
def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_s, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_s, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_sst_sv_32_scaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt > {
def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_d, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_d, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_sst_sv_32_unscaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_d, uxtw_opnd>;
def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_d, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_32b_sst_sv_32_unscaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_s, uxtw_opnd>;
def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_s, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22} = 0b0;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
SDPatternOperator op,
RegisterOperand zprext,
ValueType vt> {
def _SCALED_REAL : sve_mem_sst_sv2<msz, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt),
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
RegisterOperand VecList, Operand imm_ty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> imm5;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = opc{2-1};
let Inst{22} = 0b1;
let Inst{21} = opc{0};
let Inst{20-16} = imm5;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_32b_sst_vi_ptrs<bits<3> opc, string asm,
Operand imm_ty,
SDPatternOperator op,
ValueType vt> {
def _IMM : sve_mem_sst_vi<opc, asm, ZPR32, Z_s, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt),
(!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
multiclass sve_mem_64b_sst_vi_ptrs<bits<3> opc, string asm,
Operand imm_ty,
SDPatternOperator op,
ValueType vt> {
def _IMM : sve_mem_sst_vi<opc, asm, ZPR64, Z_d, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt),
(!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
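// Register spill/fill companions: store a whole Z or P register at a scalar
// base plus a 9-bit signed, vector-length-scaled immediate offset.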
class sve_mem_z_spill<string asm>
: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
multiclass sve_mem_z_spill<string asm> {
def NAME : sve_mem_z_spill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_p_spill<string asm>
: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayStore = 1;
}
multiclass sve_mem_p_spill<string asm> {
def NAME : sve_mem_p_spill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Predicates Group
//===----------------------------------------------------------------------===//
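// Binary permutes of two predicate registers, plus predicate unpacks that
// widen the low or high half of a predicate to the next element size.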
class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm),
asm, "\t$Pd, $Pn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b10;
let Inst{19-16} = Pm;
let Inst{15-13} = 0b010;
let Inst{12-10} = opc;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_perm_bin_perm_pp<opc, 0b00, asm, PPR8>;
def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_punpk<bit opc, string asm>
: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
asm, "\t$Pd, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-17} = 0b000001010011000;
let Inst{16} = opc;
let Inst{15-9} = 0b0100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> {
def NAME : sve_int_perm_punpk<opc, asm>;
def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>;
}
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
asm, "\t$Pd, $Pg/z",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
let Inst{31-23} = 0b001001010;
let Inst{22} = s;
let Inst{21-9} = 0b0110001111000;
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pd;
let Defs = !if(!eq (s, 1), [NZCV], []);
let Uses = [FFR];
}
multiclass sve_int_rdffr_pred<bit s, string asm, SDPatternOperator op> {
def _REAL : sve_int_rdffr_pred<s, asm>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>;
}
}
class sve_int_rdffr_unpred<string asm> : I<
(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
"",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-4} = 0b0010010100011001111100000000;
let Inst{3-0} = Pd;
let Uses = [FFR];
}
multiclass sve_int_rdffr_unpred<string asm, SDPatternOperator op> {
def _REAL : sve_int_rdffr_unpred<asm>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd)>;
}
}
class sve_int_wrffr<string asm, SDPatternOperator op>
: I<(outs), (ins PPR8:$Pn),
asm, "\t$Pn",
"",
[(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
bits<4> Pn;
let Inst{31-9} = 0b00100101001010001001000;
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let hasSideEffects = 1;
let Defs = [FFR];
}
class sve_int_setffr<string asm, SDPatternOperator op>
: I<(outs), (ins),
asm, "",
"",
[(op)]>, Sched<[]> {
let Inst{31-0} = 0b00100101001011001001000000000000;
let hasSideEffects = 1;
let Defs = [FFR];
}
//===----------------------------------------------------------------------===//
// SVE Permute Vector - Predicated Group
//===----------------------------------------------------------------------===//
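// Predicated element extraction and copies: CLAST-style conditional
// last-element extraction into a GPR, FPR or vector destination, LAST-style
// extraction, vector splice, element reverses (RBIT/REVB/REVH/REVW), and
// merging copies of a scalar into the active lanes.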
class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b11000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
}
multiclass sve_int_perm_clast_rz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
def : SVE_3_Op_Pat<i32, op, nxv16i1, i32, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<i32, op, nxv8i1, i32, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<i32, op, nxv4i1, i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<i64, op, nxv2i1, i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10101;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
}
multiclass sve_int_perm_clast_vz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10100;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_perm_clast_zz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass resultRegType>
: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Rd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Rd;
}
multiclass sve_int_perm_last_r<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<i32, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i32, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass dstRegtype>
: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10001;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
multiclass sve_int_perm_last_v<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101100100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn),
asm, "\t$Zd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101101100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_int_perm_splice_cons<string asm> {
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
}
class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-18} = 0b1001;
let Inst{17-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_perm_rev_rbit<string asm, SDPatternOperator op> {
def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_perm_rev_revb<string asm,
SDPatternOperator int_op,
SDPatternOperator ir_op> {
def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_1_Op_AllActive_Pat<nxv8i16, ir_op, nxv8i16, !cast<Instruction>(NAME # _H), PTRUE_H>;
def : SVE_1_Op_AllActive_Pat<nxv4i32, ir_op, nxv4i32, !cast<Instruction>(NAME # _S), PTRUE_S>;
def : SVE_1_Op_AllActive_Pat<nxv2i64, ir_op, nxv2i64, !cast<Instruction>(NAME # _D), PTRUE_D>;
}
multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> {
def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_perm_rev_revw<string asm, SDPatternOperator op> {
def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
asm, "\t$Zd, $Pg/m, $Rn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101000101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)),
(!cast<Instruction>(NAME # _B) $passthru, $pg, $splat)>;
def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)),
(!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)),
(!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegtype>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
asm, "\t$Zd, $Pg/m, $Vn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b100000100;
let Inst{12-10} = Pg;
let Inst{9-5} = Vn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
(!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)),
(!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b000001011;
let Inst{22} = sz;
let Inst{21-13} = 0b100001100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_compact<string asm, SDPatternOperator op> {
def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Contiguous Load Group
//===----------------------------------------------------------------------===//
class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20} = nf;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
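// Non-faulting forms (nf = 1) are modelled as implicitly using and defining
// the first-fault register (FFR), as encoded by the !if operators below.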
let Uses = !if(!eq(nf, 1), [FFR], []);
let Defs = !if(!eq(nf, 1), [FFR], []);
}
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
}
}
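// Illustrative note (not part of this change): the multiclass above is reached
// through sve_mem_cld_si (nf = 0) and sve_mem_cldnf_si (nf = 1); a typical
// instantiation looks roughly like
//   defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
// Instruction selection uses the unsuffixed pseudo, which PseudoInstExpansion
// rewrites to the _REAL encoding late, after the early machine-code passes
// mentioned in the comment above.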
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b000;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
bits<5> Zt;
bits<5> Rn;
bits<3> Pg;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_ldqr_si<sz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
}
class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<5> Rm;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand VecList, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<6> imm6;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = dtypeh;
let Inst{22} = 1;
let Inst{21-16} = imm6;
let Inst{15} = 0b1;
let Inst{14-13} = dtypel;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b01;
let Inst{13} = ff;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
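// First-faulting forms (ff = 1) are modelled as implicitly using and defining
// the first-fault register (FFR).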
let Uses = !if(!eq(ff, 1), [FFR], []);
let Defs = !if(!eq(ff, 1), [FFR], []);
}
multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
}
}
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
//===----------------------------------------------------------------------===//
// SVE Memory - 32-bit Gather and Unsized Contiguous Group
//===----------------------------------------------------------------------===//
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
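// bit opc{0} is '1' for the first-faulting form, which is modelled as
// implicitly using and defining FFR (see the Uses/Defs below)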
class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty,
SDPatternOperator op, ValueType vt> {
def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>;
}
def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)),
(!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
class sve_mem_prfm_si<bits<2> msz, string asm>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<3> Pg;
bits<6> imm6;
bits<4> prfop;
let Inst{31-22} = 0b1000010111;
let Inst{21-16} = imm6;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
def NAME : sve_mem_prfm_si<msz, asm>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$prfop, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
bits<3> Pg;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{2-1};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14} = opc{0};
let Inst{13} = 0b0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
let Inst{31-23} = 0b100001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
PatFrag op_sxtw,
PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
}
multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}
class sve_mem_z_fill<string asm>
: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_z_fill<string asm> {
def NAME : sve_mem_z_fill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
}
class sve_mem_p_fill<string asm>
: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayLoad = 1;
}
multiclass sve_mem_p_fill<string asm> {
def NAME : sve_mem_p_fill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31} = 0b1;
let Inst{30} = opc{4};
let Inst{29-25} = 0b00010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
asm, Z_s>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
def : Pat<(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
}
multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
asm, Z_d>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
def : Pat<(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - 64-bit Gather Group
//===----------------------------------------------------------------------===//
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
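// The multiclasses below map these onto the addressing modes: the *_32_*
// variants use 32-bit sxtw/uxtw-extended offsets (lsl = 0, with xs selecting
// the extension), while the *_64_* variants use 64-bit offsets (lsl = 1).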
class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
SDPatternOperator op,
RegisterOperand zprext, ValueType vt> {
def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
}
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
SDPatternOperator op, ValueType vt> {
def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>;
}
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty,
SDPatternOperator op, ValueType vt> {
def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>;
}
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)),
(!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
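// As with the gathers above, the _ext_scaled multiclass below uses the
// sxtw/uxtw forms (lsl = 0) and the _lsl_scaled multiclass uses the 64-bit
// shifted-offset form (lsl = 1).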
class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
let Inst{31-23} = 0b110001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
PatFrag op_sxtw,
PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
RegisterOperand zprext, PatFrag frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
}
class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE Compute Vector Address Group
//===----------------------------------------------------------------------===//
class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
ZPRRegOp zprty, RegisterOperand zprext>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
asm, "\t$Zd, [$Zn, $Zm]",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1010;
let Inst{11-10} = msz;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
}
multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
}
multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
}
multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Misc - Unpredicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_misc_0_b<string asm, SDPatternOperator op> {
def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{7-6};
let Inst{21} = 0b1;
let Inst{20-16} = opc{5-1};
let Inst{15-11} = 0b10111;
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_bin_cons_misc_0_c_fexpa<string asm, SDPatternOperator op> {
def _H : sve_int_bin_cons_misc_0_c<0b01000000, asm, ZPR16>;
def _S : sve_int_bin_cons_misc_0_c<0b10000000, asm, ZPR32>;
def _D : sve_int_bin_cons_misc_0_c<0b11000000, asm, ZPR64>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Reduction Group
//===----------------------------------------------------------------------===//
class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
ZPRRegOp zprty, RegisterClass regtype>
: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21} = 0b0;
let Inst{20-19} = fmt;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
}
multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm, SDPatternOperator op, SDPatternOperator opSaddv> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<i64, opSaddv, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_reduce_1<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
}
multiclass sve_int_reduce_2<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
}
class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm,
ZPRRegOp zprty, string pg_suffix, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pg_suffix#", $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21-19} = 0b010;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let ElementSize = zprty.ElementSize;
}
multiclass sve_int_movprfx_pred_merge<bits<3> opc, string asm> {
let Constraints = "$Zd = $_Zd" in {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/m",
(ins ZPR8:$_Zd, PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/m",
(ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/m",
(ins ZPR32:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/m",
(ins ZPR64:$_Zd, PPR3bAny:$Pg, ZPR64:$Zn)>;
}
}
multiclass sve_int_movprfx_pred_zero<bits<3> opc, string asm> {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/z",
(ins PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/z",
(ins PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/z",
(ins PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/z",
(ins PPR3bAny:$Pg, ZPR64:$Zn)>;
}
//===----------------------------------------------------------------------===//
// SVE Propagate Break Group
//===----------------------------------------------------------------------===//
class sve_int_brkp<bits<2> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23} = 0b0;
let Inst{22} = opc{1};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
multiclass sve_int_brkp<bits<2> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_brkp<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Partition Break Group
//===----------------------------------------------------------------------===//
class sve_int_brkn<bit S, string asm>
: I<(outs PPR8:$Pdm), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$_Pdm),
asm, "\t$Pdm, $Pg/z, $Pn, $_Pdm",
"",
[]>, Sched<[]> {
bits<4> Pdm;
bits<4> Pg;
bits<4> Pn;
let Inst{31-23} = 0b001001010;
let Inst{22} = S;
let Inst{21-14} = 0b01100001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pdm;
let Constraints = "$Pdm = $_Pdm";
let Defs = !if(!eq (S, 0b1), [NZCV], []);
}
multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_brkn<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
: I<(outs PPR8:$Pd), iops,
asm, "\t$Pd, $Pg"#suffix#", $Pn",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{2-1};
let Inst{21-14} = 0b01000001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Constraints = !if(!eq (opc{0}, 1), "$Pd = $_Pd", "");
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
multiclass sve_int_break_m<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
multiclass sve_int_break_z<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE2 String Processing Group
//===----------------------------------------------------------------------===//
class sve2_char_match<bit sz, bit opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc;
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Segment Group
//===----------------------------------------------------------------------===//
class sve2_hist_gen_segment<string asm, SDPatternOperator op>
: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Vector Group
//===----------------------------------------------------------------------===//
class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<3> Pg;
bits<5> Zm;
let Inst{31-23} = 0b010001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> {
def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Crypto Extensions Group
//===----------------------------------------------------------------------===//
class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
SDPatternOperator op, ValueType vt> {
def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>;
def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
}
class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
let Inst{31-17} = 0b010001010010001;
let Inst{16} = opc{1};
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
}
multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
SDPatternOperator op, ValueType vt> {
def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
}
class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
asm, "\t$Zdn, $_Zdn",
"",
[]>, Sched<[]> {
bits<5> Zdn;
let Inst{31-11} = 0b010001010010000011100;
let Inst{10} = opc;
let Inst{9-5} = 0b00000;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
}
multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE BFloat16 Group
//===----------------------------------------------------------------------===//
class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-21} = 0b01100100011;
let Inst{15-14} = opc;
let Inst{13-10} = 0b0000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeH;
}
class sve_bfloat_dot<string asm>
: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
bits<5> Zm;
let Inst{20-16} = Zm;
}
multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_dot<asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}
class sve_bfloat_dot_indexed<string asm>
: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_dot_indexed<asm>;
def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
}
class sve_bfloat_matmul<string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zm;
bits<5> Zda;
bits<5> Zn;
let Inst{31-21} = 0b01100100011;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b111001;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeH;
}
multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul<asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}
class sve_bfloat_matmul_longvecl<bit BT, string asm>
: sve_bfloat_matmul<asm> {
let Inst{23} = 0b1;
let Inst{14-13} = 0b00;
let Inst{10} = BT;
}
multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
}
class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
bits<3> iop;
bits<3> Zm;
let Inst{23} = 0b1;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
let Inst{10} = BT;
}
multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
}
class sve_bfloat_convert<bit N, string asm>
: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-25} = 0b0110010;
let Inst{24} = N;
let Inst{23-13} = 0b10001010101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let hasSideEffects = 1;
let ElementSize = ElementSizeS;
}
multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_convert<N, asm>;
def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Matrix Multiply Group
//===----------------------------------------------------------------------===//
class sve_int_matmul<bits<2> uns, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = uns;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b100110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
}
multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> {
def NAME : sve_int_matmul<uns, asm>;
def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Mixed Sign Group
//===----------------------------------------------------------------------===//
class sve_int_dot_mixed<string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000100100;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b011110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
}
multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> {
def NAME : sve_int_dot_mixed<asm>;
def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Mixed Sign - Indexed Group
//===----------------------------------------------------------------------===//
class sve_int_dot_mixed_indexed<bit U, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx),
asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<3> Zm;
bits<2> idx;
let Inst{31-21} = 0b01000100101;
let Inst{20-19} = idx;
let Inst{18-16} = Zm;
let Inst{15-11} = 0b00011;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
}
multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
def NAME : sve_int_dot_mixed_indexed<U, asm>;
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Matrix Multiply Accumulate Group
//===----------------------------------------------------------------------===//
class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-23} = 0b011001001;
let Inst{22} = sz;
let Inst{21} = 1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b111001;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> {
def NAME : sve_fp_matrix_mla<sz, asm, zprty>;
def : SVE_3_Op_Pat<vt, op , vt, vt, vt, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Contiguous Load And Replicate 256-bit Group
//===----------------------------------------------------------------------===//
class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
bits<5> Zt;
bits<5> Rn;
bits<3> Pg;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-20} = 0b010;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
def NAME : sve_mem_ldor_si<sz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
// Base addressing mode
- def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
- (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
-
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+ let AddedComplexity = 2 in {
+ // Reg + Imm addressing mode
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))),
+ (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>;
+ }
}
class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<5> Rm;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = 0b01;
let Inst{20-16} = Rm;
let Inst{15-13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty,
ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> {
def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
(!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
}
//===----------------------------------------------------------------------===//
// SVE Interleave 128-bit Elements Group
//===----------------------------------------------------------------------===//
class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm>
: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-21} = 0b00000101101;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-11} = opc;
let Inst{10} = P;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> {
def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
}
/// Addressing modes
def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>;
def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
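// Illustrative aside (hedged, not part of the patch): the ComplexPatterns
// above defer address selection to C++ selectors such as
// SelectAddrModeIndexedSVE<Min,Max>.  The hypothetical helper below only
// sketches the shape of such a signed, scaled immediate-range check; it is
// not LLVM's implementation.
#include <cstdint>

static bool fitsScaledSignedImm(int64_t ByteOffset, int64_t AccessBytes,
                                int64_t Min, int64_t Max) {
  // The offset must be a whole number of accesses...
  if (AccessBytes == 0 || ByteOffset % AccessBytes != 0)
    return false;
  // ...and the scaled immediate must fit the signed window, e.g. [-8, 7]
  // for am_sve_indexed_s4 (a 4-bit signed immediate).
  int64_t Imm = ByteOffset / AccessBytes;
  return Imm >= Min && Imm <= Max;
}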
// Predicated pseudo floating point two operand instructions.
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
// Predicated pseudo integer two operand instructions.
multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
// As sve_int_bin_pred but when only i32 and i64 vector types are required.
multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 11454841cab7..5c1a4cb16568 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1,17162 +1,17173 @@
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//
#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
const PPCSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
bool isPPC64 = Subtarget.isPPC64();
setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
if (hasSPE()) {
addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
} else {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
}
// Match BITREVERSE to customized fast code sequence in the td file.
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
}
if (Subtarget.isISA3_0()) {
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
setTruncStoreAction(MVT::f64, MVT::f16, Legal);
setTruncStoreAction(MVT::f32, MVT::f16, Legal);
} else {
// No extending loads from f16 or HW conversions back and forth.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
}
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// PowerPC has pre-inc loads and stores.
setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
if (!Subtarget.hasSPE()) {
setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
}
// PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
setOperationAction(ISD::ADDC, VT, Legal);
setOperationAction(ISD::ADDE, VT, Legal);
setOperationAction(ISD::SUBC, VT, Legal);
setOperationAction(ISD::SUBE, VT, Legal);
}
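// Illustrative aside (hedged, not the lowering itself): the ADDC/ADDE pair
// models a carry chain.  Adding two 128-bit values as two 64-bit limbs, the
// low addition produces a carry (ADDC) that the high addition consumes (ADDE).
#include <cstdint>

static void add128(uint64_t ALo, uint64_t AHi, uint64_t BLo, uint64_t BHi,
                   uint64_t &Lo, uint64_t &Hi) {
  Lo = ALo + BLo;                         // ADDC: add, record the carry out
  uint64_t Carry = Lo < ALo ? 1 : 0;      // unsigned wrap means a carry occurred
  Hi = AHi + BHi + Carry;                 // ADDE: add with the carry in
}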
if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
if (isPPC64 || Subtarget.hasFPCVT()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
} else {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
}
// PowerPC does not support direct load/store of condition registers.
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
// FIXME: Remove this once the ANDI glue bug is fixed:
if (ANDIGlueBug)
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
}
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
// PowerPC has no SREM/UREM instructions unless we are on P9.
// On P9 we may use a hardware instruction to compute the remainder.
// When the results of both the remainder and the division are required, it
// is more efficient to compute the remainder from the result of the division
// than to use the remainder instruction. The instructions are legalized
// directly because the DivRemPairsPass performs the transformation at the IR
// level.
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::SREM, MVT::i32, Legal);
setOperationAction(ISD::UREM, MVT::i32, Legal);
setOperationAction(ISD::SREM, MVT::i64, Legal);
setOperationAction(ISD::UREM, MVT::i64, Legal);
} else {
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
}
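// Illustrative aside (hedged): the transformation the comment above refers
// to, written as plain C++ rather than IR.  When both results are needed,
// the remainder is recovered from the quotient with a multiply and a
// subtract instead of issuing a second divide/remainder operation.
#include <cstdint>

static void divAndRem(int64_t A, int64_t B, int64_t &Q, int64_t &R) {
  Q = A / B;      // one hardware division
  R = A - Q * B;  // remainder derived from the quotient
}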
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
// Handle constrained floating-point operations on scalars.
// TODO: Handle SPE-specific operations.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
if (Subtarget.hasVSX())
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
if (Subtarget.hasFSQRT()) {
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
}
if (Subtarget.hasFPRND()) {
setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
}
// We don't support sin/cos/sqrt/fmod/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
if (Subtarget.hasSPE()) {
setOperationAction(ISD::FMA , MVT::f64, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
} else {
setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FMA , MVT::f32, Legal);
}
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
Subtarget.hasFRE()))
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
Subtarget.hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
if (Subtarget.hasFCPSGN()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
} else {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
}
if (Subtarget.hasFPRND()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
// PowerPC does not have BSWAP, but we can use the vector BSWAP instruction
// xxbrd to speed up scalar BSWAP64.
// CTPOP and CTTZ were introduced in P8 and P9, respectively.
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
if (Subtarget.hasP9Vector())
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
else
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
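// Illustrative aside (hedged): roughly what "Expand" costs for a 64-bit byte
// swap in the absence of a scalar instruction -- the shift-and-mask sequence
// that the xxbrd-based custom lowering above aims to beat on P9.
#include <cstdint>

static uint64_t bswap64(uint64_t V) {
  V = ((V & 0x00FF00FF00FF00FFULL) << 8)  | ((V >> 8)  & 0x00FF00FF00FF00FFULL);
  V = ((V & 0x0000FFFF0000FFFFULL) << 16) | ((V >> 16) & 0x0000FFFF0000FFFFULL);
  return (V << 32) | (V >> 32);
}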
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}
if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
}
// PowerPC does not have ROTR
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
if (!Subtarget.useCRBits()) {
// PowerPC does not have Select
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Expand);
}
// PowerPC wants to turn select_cc of FP into fsel when possible.
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
// PowerPC wants to optimize integer setcc a bit
if (!Subtarget.useCRBits())
setOperationAction(ISD::SETCC, MVT::i32, Custom);
// PowerPC does not have BRCOND which requires SetCC
if (!Subtarget.useCRBits())
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
if (Subtarget.hasSPE()) {
// SPE has built-in conversions
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
// PowerPC does not have [U|S]INT_TO_FP
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
}
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::LRINT, MVT::f64, Legal);
setOperationAction(ISD::LRINT, MVT::f32, Legal);
setOperationAction(ISD::LLRINT, MVT::f64, Legal);
setOperationAction(ISD::LLRINT, MVT::f32, Legal);
setOperationAction(ISD::LROUND, MVT::f64, Legal);
setOperationAction(ISD::LROUND, MVT::f32, Legal);
setOperationAction(ISD::LLROUND, MVT::f64, Legal);
setOperationAction(ISD::LLROUND, MVT::f32, Legal);
}
} else {
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
}
// We cannot sextinreg(i1). Expand to shifts.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to support
// SjLj exception handling, but rather to provide a light-weight
// setjmp/longjmp replacement for continuations, user-level threading, etc.
// As a result, no other SjLj exception interfaces are implemented; please
// don't build your own exception handling on top of them.
// LLVM/Clang supports zero-cost DWARF exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
// We want to legalize GlobalAddress and ConstantPool nodes into the
// appropriate instructions to materialize the address.
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
// TRAP is legal.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// TRAMPOLINE is custom lowered.
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
if (Subtarget.is64BitELFABI()) {
// VAARG always uses double-word chunks, so promote anything smaller.
setOperationAction(ISD::VAARG, MVT::i1, Promote);
AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i8, Promote);
AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i16, Promote);
AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i32, Promote);
AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
} else if (Subtarget.is32BitELFABI()) {
// VAARG is custom lowered with the 32-bit SVR4 ABI.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::i64, Custom);
} else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
// VACOPY is custom lowered with the 32-bit SVR4 ABI.
if (Subtarget.is32BitELFABI())
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
else
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
// Use the default implementation.
setOperationAction(ISD::VAEND , MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Comparisons that require checking two conditions.
if (Subtarget.hasSPE()) {
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETO, MVT::f64, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
}
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
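// Illustrative aside (hedged): the condition codes expanded above need two
// predicate checks when only single-condition FP compares are available.
// For example, "ordered and not equal" (SETONE) and "unordered or equal"
// (SETUEQ) decompose as follows:
static bool setONE(double A, double B) {
  return (A < B) || (A > B);    // false if equal or if either operand is NaN
}
static bool setUEQ(double A, double B) {
  return !(A < B) && !(A > B);  // true if equal or if either operand is NaN
}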
if (Subtarget.has64BitSupport()) {
// They also have instructions for converting between i64 and fp.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
// This is just the low 32 bits of a (signed) fp->i64 conversion.
// We cannot do this with Promote because i64 is not a legal type.
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
if (Subtarget.hasSPE()) {
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
} else
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
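// Illustrative aside (hedged): the observation in the comment above about
// FP_TO_UINT i32 being just the low 32 bits of a signed fp->i64 conversion,
// shown as plain C++ rather than the lowering code itself.  For values that
// fit in 32 bits, the unsigned result is the low word of the wide conversion.
#include <cstdint>

static uint32_t fpToUint32ViaI64(double X) {
  int64_t Wide = static_cast<int64_t>(X); // signed fp -> i64 conversion
  return static_cast<uint32_t>(Wide);     // keep only the low 32 bits
}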
// With the instructions enabled under FPCVT, we can do everything.
if (Subtarget.hasFPCVT()) {
if (Subtarget.has64BitSupport()) {
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
}
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
}
if (Subtarget.use64BitRegs()) {
// 64-bit PowerPC implementations can support i64 types directly
addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// 64-bit PowerPC wants to expand i128 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
} else {
// 32-bit PowerPC wants to expand i64 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
}
if (Subtarget.hasAltivec()) {
for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
}
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
// For v2i64, these are only valid with P8Vector. This is corrected after
// the loop.
if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
}
else {
setOperationAction(ISD::SMAX, VT, Expand);
setOperationAction(ISD::SMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::UMIN, VT, Expand);
}
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FMINNUM, VT, Legal);
}
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
setOperationAction(ISD::CTPOP, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
}
else {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
}
// Vector instructions introduced in P9
if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
setOperationAction(ISD::CTTZ, VT, Legal);
else
setOperationAction(ISD::CTTZ, VT, Expand);
// We promote all shuffles to v16i8.
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
// We promote all non-typed operations to v4i32.
setOperationAction(ISD::AND , VT, Promote);
AddPromotedToType (ISD::AND , VT, MVT::v4i32);
setOperationAction(ISD::OR , VT, Promote);
AddPromotedToType (ISD::OR , VT, MVT::v4i32);
setOperationAction(ISD::XOR , VT, Promote);
AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
setOperationAction(ISD::LOAD , VT, Promote);
AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::SELECT_CC, VT, Promote);
AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
setOperationAction(ISD::STORE, VT, Promote);
AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
// No other operations are legal.
setOperationAction(ISD::MUL , VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
if (!Subtarget.hasP8Vector()) {
setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
}
for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ABS, VT, Custom);
// We can custom expand all VECTOR_SHUFFLEs to VPERM; others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
// Vector truncates to sub-word integers that fit in an Altivec/VSX register
// are cheap, so handle them before they get expanded to scalar.
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::AND , MVT::v4i32, Legal);
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
setOperationAction(ISD::SELECT, MVT::v4i32,
Subtarget.useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
// Without hasP8Altivec set, v2i64 SMAX isn't available.
// But ABS custom lowering requires SMAX support.
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);
// Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
if (Subtarget.hasAltivec())
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ROTL, VT, Legal);
// With hasP8Altivec set, we can lower ISD::ROTL to vrld.
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::ROTL, MVT::v2i64, Legal);
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
// Altivec does not contain unordered floating-point compare instructions
setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
}
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
// The nearbyint variants are not allowed to raise the inexact exception
// so we can only code-gen them with unsafe math.
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
}
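// Illustrative aside (hedged): the standard-library distinction the comment
// above relies on.  nearbyint() must not raise FE_INEXACT, while rint() may;
// mapping FNEARBYINT to a rounding instruction that sets the inexact flag is
// therefore only acceptable when FP exceptions are ignored (unsafe math).
#include <cfenv>
#include <cmath>

static bool nearbyintStaysExact(double X) {
  std::feclearexcept(FE_INEXACT);
  (void)std::nearbyint(X);                    // never raises FE_INEXACT
  return std::fetestexcept(FE_INEXACT) == 0;  // expected to be true
}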
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
// Share the Altivec comparison restrictions.
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
setOperationAction(ISD::STORE, MVT::v2f64, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector())
addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
if (Subtarget.hasP8Altivec()) {
setOperationAction(ISD::SHL, MVT::v2i64, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
// 128-bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O. However, due to direct move costs, it's not worth
// doing so.
setOperationAction(ISD::SHL, MVT::v1i128, Expand);
setOperationAction(ISD::SRL, MVT::v1i128, Expand);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
}
else {
setOperationAction(ISD::SHL, MVT::v2i64, Expand);
setOperationAction(ISD::SRA, MVT::v2i64, Expand);
setOperationAction(ISD::SRL, MVT::v2i64, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
// VSX v2i64 only supports non-arithmetic operations.
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
}
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::STORE, MVT::v2i64, Promote);
AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
// Custom handling for partial vectors of integers converted to
// floating point. We already have optimal handling for v2i32 through
// the DAG combine, so those aren't necessary.
setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
if (Subtarget.hasDirectMove())
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
// Handle constrained floating-point operations on vectors.
// The predicate is `hasVSX` because Altivec instructions do not raise
// floating-point exceptions but VSX vector instructions do.
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
}
if (Subtarget.hasP8Altivec()) {
addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
}
if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
// 128-bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O.
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
setOperationAction(ISD::FADD, MVT::f128, Legal);
setOperationAction(ISD::FSUB, MVT::f128, Legal);
setOperationAction(ISD::FDIV, MVT::f128, Legal);
setOperationAction(ISD::FMUL, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
// No extending loads to f128 on PPC.
for (MVT FPT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
setOperationAction(ISD::FMA, MVT::f128, Legal);
setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
setOperationAction(ISD::FRINT, MVT::f128, Legal);
setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
setOperationAction(ISD::FCEIL, MVT::f128, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::FROUND, MVT::f128, Legal);
setOperationAction(ISD::SELECT, MVT::f128, Expand);
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i128, Custom);
// No implementation for these ops for PowerPC.
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
// Handle constrained floating-point operations on fp128.
setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
}
if (Subtarget.hasP9Altivec()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
}
}
if (Subtarget.hasQPX()) {
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FREM, MVT::v4f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
setOperationAction(ISD::STORE , MVT::v4f64, Custom);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
setOperationAction(ISD::FABS , MVT::v4f64, Legal);
setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::FREM, MVT::v4f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
setOperationAction(ISD::STORE , MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
setOperationAction(ISD::FABS , MVT::v4f32, Legal);
setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
setOperationAction(ISD::AND , MVT::v4i1, Legal);
setOperationAction(ISD::OR , MVT::v4i1, Legal);
setOperationAction(ISD::XOR , MVT::v4i1, Legal);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
setOperationAction(ISD::STORE , MVT::v4i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
// These need to set FE_INEXACT, and so cannot be vectorized here.
setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
} else {
setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
}
// TODO: Handle constrained floating-point operations of v4f64
}
if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
if (!isPPC64) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
setBooleanContents(ZeroOrOneBooleanContent);
if (Subtarget.hasAltivec()) {
// Altivec instructions set fields to all zeros or all ones.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
}
if (!isPPC64) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
}
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
if (Subtarget.useCRBits())
setTargetDAGCombine(ISD::BRCOND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
if (Subtarget.useCRBits()) {
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::SELECT_CC);
}
// Use reciprocal estimates.
if (TM.Options.UnsafeFPMath) {
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::FSQRT);
}
if (Subtarget.hasP9Altivec()) {
setTargetDAGCombine(ISD::ABS);
setTargetDAGCombine(ISD::VSELECT);
}
setLibcallName(RTLIB::LOG_F128, "logf128");
setLibcallName(RTLIB::LOG2_F128, "log2f128");
setLibcallName(RTLIB::LOG10_F128, "log10f128");
setLibcallName(RTLIB::EXP_F128, "expf128");
setLibcallName(RTLIB::EXP2_F128, "exp2f128");
setLibcallName(RTLIB::SIN_F128, "sinf128");
setLibcallName(RTLIB::COS_F128, "cosf128");
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::FMIN_F128, "fminf128");
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
setLibcallName(RTLIB::POWI_F128, "__powikf2");
setLibcallName(RTLIB::REM_F128, "fmodf128");
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
setHasMultipleConditionRegisters();
setJumpIsExpensive();
}
setMinFunctionAlignment(Align(4));
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
setPrefLoopAlignment(Align(16));
setPrefFunctionAlignment(Align(16));
break;
}
if (Subtarget.enableMachineScheduler())
setSchedulingPreference(Sched::Source);
else
setSchedulingPreference(Sched::Hybrid);
computeRegisterProperties(STI.getRegisterInfo());
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
Subtarget.getCPUDirective() == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemsetOptSize = 16;
MaxStoresPerMemcpy = 32;
MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
} else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
// The A2 also benefits from (very) aggressive inlining of memcpy and
// friends. The overhead of the function call, even when warm, can be
// over one hundred cycles.
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
MaxLoadsPerMemcmp = 128;
} else {
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
// Let the subtarget (CPU) decide if a predictable select is more expensive
// than the corresponding branch. This information is used in CGP to decide
// when to convert selects into branches.
PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
}
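// Taken together, the table entries and libcall names above determine the
// final lowering; e.g. with the f128 support enabled, an ISD::FSIN on f128
// is Expand and is therefore legalized to a libcall using the "sinf128"
// name registered via setLibcallName above.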
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
if (MaxAlign == MaxMaxAlign)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (MaxMaxAlign >= 32 &&
VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
MaxAlign = Align(32);
else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
MaxAlign < 16)
MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
Align EltAlign;
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == MaxMaxAlign)
break;
}
}
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
// 16-byte and wider vectors are passed on a 16-byte boundary.
// Everything else is passed on an 8-byte boundary on PPC64 and a 4-byte boundary on PPC32.
Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
if (Subtarget.hasAltivec() || Subtarget.hasQPX())
getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
return Alignment.value();
}
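// For illustration: with Altivec, a by-value struct containing a 128-bit
// vector member is reported with 16-byte alignment by the helper above,
// while a struct of plain scalars keeps the default 8 (PPC64) or 4 (PPC32);
// with QPX, a 256-bit member raises the result to 32.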
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
bool PPCTargetLowering::hasSPE() const {
return Subtarget.hasSPE();
}
bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
/// type is cheaper than a multiply followed by a shift.
/// This is true for words and doublewords on 64-bit PowerPC.
bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
isOperationLegal(ISD::MULHU, Type)))
return true;
return TargetLowering::isMulhCheaperThanMulShift(Type);
}
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
case PPCISD::FSEL: return "PPCISD::FSEL";
case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP";
case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP";
case PPCISD::FCFID: return "PPCISD::FCFID";
case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
case PPCISD::FP_TO_UINT_IN_VSR:
return "PPCISD::FP_TO_UINT_IN_VSR,";
case PPCISD::FP_TO_SINT_IN_VSR:
return "PPCISD::FP_TO_SINT_IN_VSR";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
case PPCISD::XXSPLTI_SP_TO_DP:
return "PPCISD::XXSPLTI_SP_TO_DP";
case PPCISD::XXSPLTI32DX:
return "PPCISD::XXSPLTI32DX";
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
case PPCISD::SRA: return "PPCISD::SRA";
case PPCISD::SHL: return "PPCISD::SHL";
case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
case PPCISD::MFVSR: return "PPCISD::MFVSR";
case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
case PPCISD::ANDI_rec_1_EQ_BIT:
return "PPCISD::ANDI_rec_1_EQ_BIT";
case PPCISD::ANDI_rec_1_GT_BIT:
return "PPCISD::ANDI_rec_1_GT_BIT";
case PPCISD::VCMP: return "PPCISD::VCMP";
case PPCISD::VCMPo: return "PPCISD::VCMPo";
case PPCISD::LBRX: return "PPCISD::LBRX";
case PPCISD::STBRX: return "PPCISD::STBRX";
case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
case PPCISD::ST_VSR_SCAL_INT:
return "PPCISD::ST_VSR_SCAL_INT";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
case PPCISD::BDNZ: return "PPCISD::BDNZ";
case PPCISD::BDZ: return "PPCISD::BDZ";
case PPCISD::MFFS: return "PPCISD::MFFS";
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
case PPCISD::CR6SET: return "PPCISD::CR6SET";
case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
case PPCISD::SC: return "PPCISD::SC";
case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
case PPCISD::RFEBB: return "PPCISD::RFEBB";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::VABSD: return "PPCISD::VABSD";
case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
}
return nullptr;
}
EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
EVT VT) const {
if (!VT.isVector())
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
if (Subtarget.hasQPX())
return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
return VT.changeVectorElementTypeToInteger();
}
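// For example, a scalar i32 compare yields i1 when CR bits are in use (i32
// otherwise); a v4f32 compare yields v4i32, except under QPX where the
// result is the v4i1 mask type.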
bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
return true;
}
//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//
/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
static bool isFloatingPointZero(SDValue Op) {
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
return CFP->getValueAPF().isZero();
else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
// Maybe this has already been legalized into the constant pool?
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
return CFP->getValueAPF().isZero();
}
return false;
}
/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if it matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
return Op < 0 || Op == Val;
}
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 1;
for (unsigned i = 0; i != 8; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
return false;
}
return true;
}
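// Worked example: with ShuffleKind == 0 (big endian, two inputs) the mask
// accepted above is <1,3,5,...,31>: the odd bytes of the 32-byte
// concatenation, i.e. the low-order byte of every halfword, which is what a
// big-endian vpkuhum produces.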
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 2;
for (unsigned i = 0; i != 8; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
return false;
}
return true;
}
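// Similarly, with ShuffleKind == 0 isVPKUWUMShuffleMask accepts
// <2,3,6,7,10,11,...,30,31>: the low halfword of every word of the
// concatenated inputs, matching a big-endian vpkuwum.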
/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
/// current subtarget.
///
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (!Subtarget.hasP8Vector())
return false;
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 4;
for (unsigned i = 0; i != 8; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
!isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
return false;
}
return true;
}
/// isVMerge - Common function, used to match vmrg* shuffles.
///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned LHSStart, unsigned RHSStart) {
if (N->getValueType(0) != MVT::v16i8)
return false;
assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
"Unsupported merge size!");
for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
LHSStart+j+i*UnitSize) ||
!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
RHSStart+j+i*UnitSize))
return false;
}
return true;
}
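// For example, isVMerge(N, 1, 8, 24), the big-endian vmrglb pattern used
// below, accepts the mask <8,24,9,25,...,15,31>, interleaving the low
// halves of the two inputs byte by byte.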
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 0, 0);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 0, 16);
else
return false;
} else {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 8, 8);
else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 8, 24);
else
return false;
}
}
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 8, 8);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 8, 24);
else
return false;
} else {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 0, 0);
else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 0, 16);
else
return false;
}
}
/**
* Common function used to match vmrgew and vmrgow shuffles
*
* The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the endianness of the target
* machine.
* - Little Endian:
* - Use offset of 0 to check for odd elements
* - Use offset of 4 to check for even elements
* - Big Endian:
* - Use offset of 0 to check for even elements
* - Use offset of 4 to check for odd elements
* A detailed description of the vector element ordering for little endian and
* big endian can be found at
* http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
* Targeting your applications - what little endian and big endian IBM XL C/C++
* compiler differences mean to you
*
* The mask to the shuffle vector instruction specifies the indices of the
* elements from the two input vectors to place in the result. The elements are
* numbered in array-access order, starting with the first vector. These vectors
* are always of type v16i8, thus each vector will contain 16 elements of size
* 8 bits. More info on the shuffle vector can be found in the
* http://llvm.org/docs/LangRef.html#shufflevector-instruction
* Language Reference.
*
* The RHSStartValue indicates whether the same input vectors are used (unary)
* or two different input vectors are used, based on the following:
* - If the instruction uses the same vector for both inputs, the range of the
* indices will be 0 to 15. In this case, the RHSStart value passed should
* be 0.
* - If the instruction has two different vectors then the range of the
* indices will be 0 to 31. In this case, the RHSStart value passed should
* be 16 (indices 0-15 specify elements in the first vector while indices 16
* to 31 specify elements in the second vector).
*
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] IndexOffset Specifies whether to look for even or odd elements
* \param[in] RHSStartValue Specifies the starting index for the righthand input
* vector to the shuffle_vector instruction
* \return true iff this shuffle vector represents an even or odd word merge
*/
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
unsigned RHSStartValue) {
if (N->getValueType(0) != MVT::v16i8)
return false;
for (unsigned i = 0; i < 2; ++i)
for (unsigned j = 0; j < 4; ++j)
if (!isConstantOrUndef(N->getMaskElt(i*4+j),
i*RHSStartValue+j+IndexOffset) ||
!isConstantOrUndef(N->getMaskElt(i*4+j+8),
i*RHSStartValue+j+IndexOffset+8))
return false;
return true;
}
/**
* Determine if the specified shuffle mask is suitable for the vmrgew or
* vmrgow instructions.
*
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
* \param[in] ShuffleKind Identify the type of merge:
* - 0 = big-endian merge with two different inputs;
* - 1 = either-endian merge with two identical inputs;
* - 2 = little-endian merge with two different inputs (inputs are swapped for
* little-endian merges).
* \param[in] DAG The current SelectionDAG
* \return true iff this shuffle mask is suitable for the vmrgew or vmrgow instructions
*/
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
unsigned indexOffset = CheckEven ? 4 : 0;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, indexOffset, 16);
else
return false;
}
else {
unsigned indexOffset = CheckEven ? 0 : 4;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 0) // Normal
return isVMerge(N, indexOffset, 16);
else
return false;
}
return false;
}
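// As a concrete case, a big-endian even merge (CheckEven == true,
// ShuffleKind == 0) accepts <0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>:
// words 0 and 2 of each input interleaved, i.e. the vmrgew pattern.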
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
/// The ShuffleKind distinguishes between big-endian operations with two
/// different inputs (0), either-endian operations with two identical inputs
/// (1), and little-endian operations with two different inputs (2). For the
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
if (N->getValueType(0) != MVT::v16i8)
return -1;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Find the first non-undef value in the shuffle mask.
unsigned i;
for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
/*search*/;
if (i == 16) return -1; // all undef.
// Otherwise, check to see if the rest of the elements are consecutively
// numbered from this value.
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
ShiftAmt -= i;
bool isLE = DAG.getDataLayout().isLittleEndian();
if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
return -1;
} else if (ShuffleKind == 1) {
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
return -1;
} else
return -1;
if (isLE)
ShiftAmt = 16 - ShiftAmt;
return ShiftAmt;
}
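// Example: on big endian (ShuffleKind == 0) the mask <3,4,5,...,18> is a
// left shift of 3 bytes across the concatenated inputs, so 3 is returned;
// on little endian the amount is mirrored to 16 - ShiftAmt.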
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
// The consecutive indices need to specify an element, not part of two
// different elements. So abandon ship early if this isn't the case.
if (N->getMaskElt(0) % EltSize != 0)
return false;
// This is a splat operation if each element of the permute is the same, and
// if the value doesn't reference the second vector.
unsigned ElementBase = N->getMaskElt(0);
// FIXME: Handle UNDEF elements too!
if (ElementBase >= 16)
return false;
// Check that the indices are consecutive, in the case of a multi-byte element
// splatted with a v16i8 mask.
for (unsigned i = 1; i != EltSize; ++i)
if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
return false;
for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
if (N->getMaskElt(i) < 0) continue;
for (unsigned j = 0; j != EltSize; ++j)
if (N->getMaskElt(i+j) != N->getMaskElt(j))
return false;
}
return true;
}
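// Example: with EltSize == 4 the mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7>
// is accepted; every word of the result is a copy of word 1 of the first
// input, which is exactly what vspltw/xxspltw can produce.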
/// Check that the mask is shuffling N byte elements. Within each N byte
/// element of the mask, the indices could be either in increasing or
/// decreasing order as long as they are consecutive.
/// \param[in] N the shuffle vector SD Node to analyze
/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
/// Word/DoubleWord/QuadWord).
/// \param[in] StepLen the index delta between adjacent bytes within an N byte
/// element: 1 if the mask is in increasing order, -1 if it is decreasing.
/// \return true iff the mask is shuffling N byte elements.
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
int StepLen) {
assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
"Unexpected element width.");
assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
unsigned NumOfElem = 16 / Width;
unsigned MaskVal[16]; // Width is never greater than 16
for (unsigned i = 0; i < NumOfElem; ++i) {
MaskVal[0] = N->getMaskElt(i * Width);
if ((StepLen == 1) && (MaskVal[0] % Width)) {
return false;
} else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
return false;
}
for (unsigned int j = 1; j < Width; ++j) {
MaskVal[j] = N->getMaskElt(i * Width + j);
if (MaskVal[j] != MaskVal[j-1] + StepLen) {
return false;
}
}
}
return true;
}
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE) {
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
unsigned M2 = N->getMaskElt(8) / 4;
unsigned M3 = N->getMaskElt(12) / 4;
unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
// Below, let H and L be arbitrary elements of the shuffle mask
// where H is in the range [4,7] and L is in the range [0,3].
// H, 1, 2, 3 or L, 5, 6, 7
if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
(M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
InsertAtByte = IsLE ? 12 : 0;
Swap = M0 < 4;
return true;
}
// 0, H, 2, 3 or 4, L, 6, 7
if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
(M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
InsertAtByte = IsLE ? 8 : 4;
Swap = M1 < 4;
return true;
}
// 0, 1, H, 3 or 4, 5, L, 7
if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
(M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
InsertAtByte = IsLE ? 4 : 8;
Swap = M2 < 4;
return true;
}
// 0, 1, 2, H or 4, 5, 6, L
if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
(M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
InsertAtByte = IsLE ? 0 : 12;
Swap = M3 < 4;
return true;
}
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
ShiftElts = 0;
Swap = true;
unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 12 : 0;
return true;
}
if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 8 : 4;
return true;
}
if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
InsertAtByte = IsLE ? 4 : 8;
return true;
}
if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
InsertAtByte = IsLE ? 0 : 12;
return true;
}
}
return false;
}
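// Example (little endian): the byte mask <16..19, 4..7, 8..11, 12..15> has
// word-level elements M0..M3 = 4,1,2,3, so the first case above fires and
// the caller gets Swap = false, InsertAtByte = 12, ShiftElts = 2.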
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the word is consecutive.
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12, which are the beginning of words.
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
unsigned M2 = N->getMaskElt(8) / 4;
unsigned M3 = N->getMaskElt(12) / 4;
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
assert(M0 < 4 && "Indexing into an undef vector?");
if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
return false;
ShiftElts = IsLE ? (4 - M0) % 4 : M0;
Swap = false;
return true;
}
// Ensure each word index of the ShuffleVector Mask is consecutive.
if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
return false;
if (IsLE) {
if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
// Input vectors don't need to be swapped if the leading element
// of the result is one of the 3 left elements of the second vector
// (or if there is no shift to be done at all).
Swap = false;
ShiftElts = (8 - M0) % 8;
} else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
// Input vectors need to be swapped if the leading element
// of the result is one of the 3 left elements of the first vector
// (or if we're shifting by 4 - thereby simply swapping the vectors).
Swap = true;
ShiftElts = (4 - M0) % 4;
}
return true;
} else { // BE
if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
// Input vectors don't need to be swapped if the leading element
// of the result is one of the 4 elements of the first vector.
Swap = false;
ShiftElts = M0;
} else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
// Input vectors need to be swapped if the leading element
// of the result is one of the 4 elements of the right vector.
Swap = true;
ShiftElts = M0 - 4;
}
return true;
}
}
bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
if (!isNByteElemShuffleMask(N, Width, -1))
return false;
for (int i = 0; i < 16; i += Width)
if (N->getMaskElt(i) != i + Width - 1)
return false;
return true;
}
bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 2);
}
bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 4);
}
bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 8);
}
bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 16);
}
/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
/// if the inputs to the instruction should be swapped and set \p DM to the
/// value for the immediate.
/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
/// AND element 0 of the result comes from the first input (LE) or second input
/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
/// mask.
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the double word is consecutive.
if (!isNByteElemShuffleMask(N, 8, 1))
return false;
unsigned M0 = N->getMaskElt(0) / 8;
unsigned M1 = N->getMaskElt(8) / 8;
assert(((M0 | M1) < 4) && "A mask element out of bounds?");
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
if ((M0 | M1) < 2) {
DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
Swap = false;
return true;
} else
return false;
}
if (IsLE) {
if (M0 > 1 && M1 < 2) {
Swap = false;
} else if (M0 < 2 && M1 > 1) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = true;
} else
return false;
// Note: if control flow comes here that means Swap is already set above
DM = (((~M1) & 1) << 1) + ((~M0) & 1);
return true;
} else { // BE
if (M0 < 2 && M1 > 1) {
Swap = false;
} else if (M0 > 1 && M1 < 2) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = true;
} else
return false;
// Note: if control flow comes here that means Swap is already set above
DM = (M0 << 1) + (M1 & 1);
return true;
}
}
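// Example (big endian): a doubleword-level mask of <0,3> (bytes <0..7,
// 24..31>) takes doubleword 0 of the first input and doubleword 1 of the
// second, so Swap = false and DM = (0 << 1) + (3 & 1) = 1.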
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
/// appropriate for PPC mnemonics (which have a big endian bias - namely
/// elements are counted from the left of the vector register).
unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
if (DAG.getDataLayout().isLittleEndian())
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
else
return SVOp->getMaskElt(0) / EltSize;
}
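// Example: splatting the word whose first mask byte is 4 (EltSize == 4)
// yields index 1 on big endian but (16/4) - 1 - 1 = 2 on little endian,
// since the mnemonics count elements from the left of the register.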
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
/// by using a vspltis[bhw] instruction of the specified element size, return
/// the constant being splatted. The ByteSize field indicates the number of
/// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
SDValue OpVal(nullptr, 0);
// If ByteSize of the splat is bigger than the element size of the
// build_vector, then we have a case where we are checking for a splat where
// multiple elements of the buildvector are folded together into a single
// logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
unsigned EltSize = 16/N->getNumOperands();
if (EltSize < ByteSize) {
unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
SDValue UniquedVals[4];
assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
// See if all of the elements in the buildvector agree across.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
if (N->getOperand(i).isUndef()) continue;
// If the element isn't a constant, bail fully out.
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
if (!UniquedVals[i&(Multiple-1)].getNode())
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
return SDValue(); // no match.
}
// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
// either constant or undef values that are identical for each chunk. See
// if these chunks can form into a larger vspltis*.
// Check to see if all of the leading entries are either 0 or -1. If
// neither, then this won't fit into the immediate field.
bool LeadingZero = true;
bool LeadingOnes = true;
for (unsigned i = 0; i != Multiple-1; ++i) {
if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
LeadingZero &= isNullConstant(UniquedVals[i]);
LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
}
// Finally, check the least significant entry.
if (LeadingZero) {
if (!UniquedVals[Multiple-1].getNode())
return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
if (Val < 16) // 0,0,0,4 -> vspltisw(4)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
if (LeadingOnes) {
if (!UniquedVals[Multiple-1].getNode())
return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
return SDValue();
}
// Check to see if this buildvec has a single non-undef value in its elements.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
if (N->getOperand(i).isUndef()) continue;
if (!OpVal.getNode())
OpVal = N->getOperand(i);
else if (OpVal != N->getOperand(i))
return SDValue();
}
if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
unsigned ValSizeInBytes = EltSize;
uint64_t Value = 0;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
Value = CN->getZExtValue();
} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
Value = FloatToBits(CN->getValueAPF().convertToFloat());
}
// If the splat value is larger than the element value, then we can never do
// this splat. The only case that we could fit the replicated bits into our
// immediate field for would be zero, and we prefer to use vxor for it.
if (ValSizeInBytes < ByteSize) return SDValue();
// If the element value is larger than the splat value, check if it consists
// of a repeated bit pattern of size ByteSize.
if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
return SDValue();
// Properly sign extend the value.
int MaskVal = SignExtend32(Value, ByteSize * 8);
// If this is zero, don't match; zero matches ISD::isBuildVectorAllZeros.
if (MaskVal == 0) return SDValue();
// Finally, if this value fits in a 5 bit sext field, return it
if (SignExtend32<5>(MaskVal) == MaskVal)
return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
return SDValue();
}
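// For instance, a v8i16 build_vector whose lanes are all 0xFFFE (-2),
// queried with ByteSize == 2, sign-extends to -2 and fits the 5-bit field,
// so -2 is returned and the splat can be materialized as vspltish -2.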
/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int PPC::isQVALIGNIShuffleMask(SDNode *N) {
EVT VT = N->getValueType(0);
if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
return -1;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Find the first non-undef value in the shuffle mask.
unsigned i;
for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
/*search*/;
if (i == 4) return -1; // all undef.
// Otherwise, check to see if the rest of the elements are consecutively
// numbered from this value.
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
ShiftAmt -= i;
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 4; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
return -1;
return ShiftAmt;
}
//===----------------------------------------------------------------------===//
// Addressing Mode Selection
//===----------------------------------------------------------------------===//
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and the
/// immediate.
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
if (N->getValueType(0) == MVT::i32)
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
else
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
/// be represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
UI != E; ++UI) {
if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
if (Memop->getMemoryVT() == MVT::f64) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}
return false;
}
/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
bool PPCTargetLowering::SelectAddressRegReg(
SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
MaybeAlign EncodingAlignment) const {
// If we have a PC Relative target flag don't select as [reg+reg]. It will be
// a [pc+imm].
if (SelectAddressPCRel(N, Base))
return false;
int16_t Imm = 0;
if (N.getOpcode() == ISD::ADD) {
// SPE f64 load/store can only handle an 8-bit offset, not the usual 16-bit
// one, so check whether this address feeds such an access and use [r+r] if so.
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
return true;
if (isIntS16Immediate(N.getOperand(1), Imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
if (isIntS16Immediate(N.getOperand(1), Imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // fold this as r+i if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are provably
// disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
if (LHSKnown.Zero.getBoolValue()) {
KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}
return false;
}
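// For example, (add %r3, %r4) is selected as Base = %r3, Index = %r4, while
// (add %r3, 100) with a word-aligned EncodingAlignment is rejected here so
// that the cheaper [r+imm] D-form can be used instead.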
// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
// FIXME: This does not handle the LWA case.
if (VT != MVT::i64)
return;
// NOTE: We'll exclude negative FIs here, which come from argument
// lowering, because there are no known test cases triggering this problem
// using packed structures (or similar). We can remove this exclusion if
// we find such a test case. The reason why this is so test-case driven is
// because this entire 'fixup' is only to prevent crashes (from the
// register scavenger) on not-really-valid inputs. For example, if we have:
// %a = alloca i1
// %b = bitcast i1* %a to i64*
// store i64 0, i64* %b
// then the store should really be marked as 'align 1', but is not. If it
// were marked as 'align 1' then the indexed form would have been
// instruction-selected initially, and the problem this 'fixup' is preventing
// won't happen regardless.
if (FrameIdx < 0)
return;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.getObjectAlign(FrameIdx) >= Align(4))
return;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasNonRISpills();
}
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(
SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
MaybeAlign EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If we have a PC Relative target flag don't select as [reg+imm]. It will be
// a [pc+imm].
if (SelectAddressPCRel(N, Base))
return false;
// If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
}
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
&& "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
Disp.getOpcode() == ISD::TargetConstantPool ||
Disp.getOpcode() == ISD::TargetJumpTable);
Base = N.getOperand(0);
return true; // [&g+r]
}
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
}
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
return true;
}
}
} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
// Loading from a constant address.
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
int16_t Imm;
if (isIntS16Immediate(CN, Imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
return true;
}
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
(!EncodingAlignment ||
isAligned(*EncodingAlignment, CN->getZExtValue()))) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
MVT::i32);
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
return true;
}
}
Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else
Base = N;
return true; // [r+0]
}
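// For example, (add %r3, 24) becomes Disp = 24 with Base = %r3, and a bare
// FrameIndex node falls through to the [r+0] case above, yielding Disp = 0
// with a TargetFrameIndex base (after the emergency-spill fixup).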
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
// Check to see if we can easily represent this as an [r+r] address. This
// will fail if it thinks that the address is more profitably represented as
// reg+imm, e.g. where imm = 0.
if (SelectAddressRegReg(N, Base, Index, DAG))
return true;
// If the address is the result of an add, we will utilize the fact that the
// address calculation includes an implicit add. However, we can reduce
// register pressure if we do not materialize a constant just for use as the
// index register. We only get rid of the add if it is not an add of a
// value and a 16-bit signed constant where both operands have a single use.
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD &&
(!isIntS16Immediate(N.getOperand(1), imm) ||
!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
// Otherwise, do it the hard way, using R0 as the base register.
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
N.getValueType());
Index = N;
return true;
}
template <typename Ty> static bool isValidPCRelNode(SDValue N) {
Ty *PCRelCand = dyn_cast<Ty>(N);
return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
}
/// Returns true if this address is a PC-relative address.
/// An address is PC-relative if it is marked with the flag
/// PPCII::MO_PCREL_FLAG or if the node opcode is PPCISD::MAT_PCREL_ADDR.
bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
// This is a materialize PC Relative node. Always select this as PC Relative.
Base = N;
if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
return true;
if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
isValidPCRelNode<GlobalAddressSDNode>(N) ||
isValidPCRelNode<JumpTableSDNode>(N) ||
isValidPCRelNode<BlockAddressSDNode>(N))
return true;
return false;
}
/// Returns true if we should use a direct load-into-vector instruction
/// (such as lxsd or lfd) instead of a load into a GPR plus a direct-move
/// sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
// If the loaded value has any uses other than scalar_to_vector, we should
// keep it as a scalar load -> direct move pattern to prevent multiple
// loads.
LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
if (!LD)
return false;
EVT MemVT = LD->getMemoryVT();
if (!MemVT.isSimple())
return false;
switch(MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
break;
case MVT::i32:
if (!ST.hasP8Vector())
return false;
break;
case MVT::i16:
case MVT::i8:
if (!ST.hasP9Vector())
return false;
break;
default:
return false;
}
SDValue LoadedVal(N, 0);
if (!LoadedVal.hasOneUse())
return false;
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
return false;
return true;
}
/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
/// offset, and addressing mode by reference, if the node's address can be
/// legally represented as a pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
if (DisablePPCPreinc) return false;
bool isLoad = true;
SDValue Ptr;
EVT VT;
unsigned Alignment;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
Alignment = LD->getAlignment();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Alignment = ST->getAlignment();
isLoad = false;
} else
return false;
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
// instructions because we can fold these into a more efficient instruction
// instead (such as LXSD).
if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
}
// PowerPC doesn't have preinc load/store instructions for vectors (except
// for QPX, which does have preinc r+r forms).
if (VT.isVector()) {
if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
return false;
} else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
AM = ISD::PRE_INC;
return true;
}
}
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
// Common code will reject creating a pre-inc form if the base pointer
// is a frame index, or if N is a store and the base pointer is either
// the same as or a predecessor of the value being stored. Check for
// those situations here, and try with swapped Base/Offset instead.
bool Swap = false;
if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
Swap = true;
else if (!isLoad) {
SDValue Val = cast<StoreSDNode>(N)->getValue();
if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
Swap = true;
}
if (Swap)
std::swap(Base, Offset);
AM = ISD::PRE_INC;
return true;
}
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
return false;
}
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
// sext i32 to i64 when addr mode is r+i.
if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
LD->getExtensionType() == ISD::SEXTLOAD &&
isa<ConstantSDNode>(Offset))
return false;
}
AM = ISD::PRE_INC;
return true;
}
//===----------------------------------------------------------------------===//
// LowerOperation implementation
//===----------------------------------------------------------------------===//
/// Set HiOpFlags and LoOpFlags to the target MO flags used to reference a
/// label; when compiling with a PIC relocation model, also add the PIC flag
/// so labels are referenced via the PIC base.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
unsigned &HiOpFlags, unsigned &LoOpFlags,
const GlobalValue *GV = nullptr) {
HiOpFlags = PPCII::MO_HA;
LoOpFlags = PPCII::MO_LO;
// Don't use the pic base if not in PIC relocation model.
if (IsPIC) {
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
}
}
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
SelectionDAG &DAG) {
SDLoc DL(HiPart);
EVT PtrVT = HiPart.getValueType();
SDValue Zero = DAG.getConstant(0, DL, PtrVT);
SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
// With PIC, the first instruction is actually "GR+hi(&G)".
if (isPIC)
Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
// Generate non-pic code that has direct accesses to the constant pool.
// The address of the global is just (hi(&g)+lo(&g)).
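// (Editor's note: this typically materializes as something like
// "lis rT, sym@ha" followed by "addi rD, rT, sym@l"; the PPCISD::Hi/Lo nodes
// carry the @ha/@l relocations. Exact instructions depend on selection.)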
return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}
static void setUsesTOCBasePtr(MachineFunction &MF) {
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setUsesTOCBasePtr();
}
static void setUsesTOCBasePtr(SelectionDAG &DAG) {
setUsesTOCBasePtr(DAG.getMachineFunction());
}
SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
SDValue GA) const {
const bool Is64Bit = Subtarget.isPPC64();
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
: Subtarget.isAIXABI()
? DAG.getRegister(PPC::R2, VT)
: DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
MachineMemOperand::MOLoad);
}
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
const Constant *C = CP->getConstVal();
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
if (Subtarget.isUsingPCRelativeCalls()) {
SDLoc DL(CP);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue ConstPool = DAG.getTargetConstantPool(
C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
}
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
SDValue CPIHi =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
SDValue CPILo =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
// For 64-bit PowerPC, prefer the more compact relative encodings.
// This trades 32 bits per jump table entry for one or two instructions
// at the jump site.
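// (Editor's note: EK_LabelDifference32 emits each entry as a 32-bit label
// difference rather than a full-width pointer, which is where the size
// saving comes from.)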
unsigned PPCTargetLowering::getJumpTableEncoding() const {
if (isJumpTableRelative())
return MachineJumpTableInfo::EK_LabelDifference32;
return TargetLowering::getJumpTableEncoding();
}
bool PPCTargetLowering::isJumpTableRelative() const {
if (UseAbsoluteJumpTables)
return false;
if (Subtarget.isPPC64() || Subtarget.isAIXABI())
return true;
return TargetLowering::isJumpTableRelative();
}
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
default:
return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
}
}
const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI,
MCContext &Ctx) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
default:
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
}
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// isUsingPCRelativeCalls() returns true when PCRelative is enabled
if (Subtarget.isUsingPCRelativeCalls()) {
SDLoc DL(JT);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GA =
DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
return MatAddr;
}
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
return getTOCEntry(DAG, SDLoc(JT), GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(GA), GA);
}
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BASDN->getBlockAddress();
// isUsingPCRelativeCalls() returns true when PCRelative is enabled
if (Subtarget.isUsingPCRelativeCalls()) {
SDLoc DL(BASDN);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
PPCII::MO_PCREL_FLAG);
SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
return MatAddr;
}
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual BlockAddress is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
return getTOCEntry(DAG, SDLoc(BASDN), GA);
}
// 32-bit position-independent ELF stores the BlockAddress in the .got.
if (Subtarget.is32BitELFABI() && isPositionIndependent())
return getTOCEntry(
DAG, SDLoc(BASDN),
DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
const TargetMachine &TM = getTargetMachine();
TLSModel::Model Model = TM.getTLSModel(GV);
if (Model == TLSModel::LocalExec) {
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_HA);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_LO);
SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
: DAG.getRegister(PPC::R2, MVT::i32);
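// (Editor's sketch: on 64-bit targets this typically becomes
// "addis rX, r13, gv@tprel@ha" followed by "addi rX, rX, gv@tprel@l", with
// r13 as the thread pointer; r2 plays that role in 32-bit mode.)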
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
}
if (Model == TLSModel::InitialExec) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TLS);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
PtrVT, GOTReg, TGA);
} else {
if (!TM.isPositionIndependent())
GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
else if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
}
if (Model == TLSModel::GeneralDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
GOTPtr, TGA, TGA);
}
if (Model == TLSModel::LocalDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
PtrVT, GOTPtr, TGA, TGA);
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
PtrVT, TLSAddr, TGA);
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
}
llvm_unreachable("Unknown TLS model!");
}
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
SDLoc DL(GSDN);
const GlobalValue *GV = GSDN->getGlobal();
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
if (Subtarget.isUsingPCRelativeCalls()) {
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isAccessedAsGotIndirect(Op)) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
PPCII::MO_PCREL_FLAG |
PPCII::MO_GOT_FLAG);
SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
MachinePointerInfo());
return Load;
} else {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
PPCII::MO_PCREL_FLAG);
return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
}
}
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return getTOCEntry(DAG, DL, GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
GSDN->getOffset(),
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, DL, GA);
}
SDValue GAHi =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
if (Op.getValueType() == MVT::v2i64) {
// When the operands themselves are v2i64 values, we need to do something
// special because VSX has no underlying comparison operations for these.
if (Op.getOperand(0).getValueType() == MVT::v2i64) {
// Equality can be handled by casting to the legal type for Altivec
// comparisons, everything else needs to be expanded.
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
DAG.getSetCC(dl, MVT::v4i32,
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
CC));
}
return SDValue();
}
// We handle most of these in the usual way.
return Op;
}
// If we're comparing for equality to zero, expose the fact that this is
// implemented as a ctlz/srl pair on ppc, so that the dag combiner can
// fold the new nodes.
if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
return V;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
// Leave comparisons against 0 and -1 alone for now, since they're usually
// optimized. FIXME: revisit this when we can custom lower all setcc
// optimizations.
if (C->isAllOnesValue() || C->isNullValue())
return SDValue();
}
// If we have an integer seteq/setne, turn it into a compare against zero
// by xor'ing the rhs with the lhs, which is faster than setting a
// condition register, reading it back out, and masking the correct bit. The
// normal approach here uses sub to do this instead of xor. Using xor exposes
// the result to other bit-twiddling opportunities.
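// (Editor's example: seteq %a, %b becomes seteq (xor %a, %b), 0, which
// exposes the ctlz/srl lowering mentioned above to later combines.)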
EVT LHSVT = Op.getOperand(0).getValueType();
if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
EVT VT = Op.getValueType();
SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
Op.getOperand(1));
return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
}
return SDValue();
}
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc dl(Node);
assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
// gpr_index
SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
VAListPtr, MachinePointerInfo(SV), MVT::i8);
InChain = GprIndex.getValue(1);
if (VT == MVT::i64) {
// Check whether GprIndex is odd (an i64 vararg must start at an even index)
SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
// Align GprIndex to be even if it isn't
GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
GprIndex);
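// (Editor's example: GprIndex == 3 is odd, so it is bumped to 4 here and the
// i64 vararg is fetched starting at an even slot of the register save area.)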
}
// fpr index is 1 byte after gpr
SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(1, dl, MVT::i32));
// fpr
SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
FprPtr, MachinePointerInfo(SV), MVT::i8);
InChain = FprIndex.getValue(1);
SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(8, dl, MVT::i32));
SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(4, dl, MVT::i32));
// areas
SDValue OverflowArea =
DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
InChain = OverflowArea.getValue(1);
SDValue RegSaveArea =
DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
InChain = RegSaveArea.getValue(1);
// select overflow_area if index >= 8
SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);
// adjustment constant gpr_index * 4/8
SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
MVT::i32));
// OurReg = RegSaveArea + RegConstant
SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
RegConstant);
// Floating types are 32 bytes into RegSaveArea
if (VT.isFloatingPoint())
OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
DAG.getConstant(32, dl, MVT::i32));
// increase {f,g}pr_index by 1 (or 2 if VT is i64)
SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
MVT::i32));
InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
VT.isInteger() ? VAListPtr : FprPtr,
MachinePointerInfo(SV), MVT::i8);
// determine if we should load from reg_save_area or overflow_area
SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
// increase overflow_area by 4/8 if gpr/fpr index >= 8
SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
DAG.getConstant(VT.isInteger() ? 4 : 8,
dl, MVT::i32));
OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
OverflowAreaPlusN);
InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
MachinePointerInfo(), MVT::i32);
return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
// We have to copy the entire va_list struct:
// 2 * sizeof(char) + 2 bytes of padding + 2 * sizeof(char *) = 12 bytes
return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
false, true, false, MachinePointerInfo(),
MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget.isAIXABI())
report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");
return Op.getOperand(0);
}
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget.isAIXABI())
report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = IntPtrTy;
Entry.Node = Trmp; Args.push_back(Entry);
// TrampSize == (isPPC64 ? 48 : 40);
Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
isPPC64 ? MVT::i64 : MVT::i32);
Args.push_back(Entry);
Entry.Node = FPtr; Args.push_back(Entry);
Entry.Node = Nest; Args.push_back(Entry);
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
}
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDLoc dl(Op);
if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
// We suppose the given va_list is already allocated.
//
// typedef struct {
// char gpr; /* index into the array of 8 GPRs
// * stored in the register save area
// * gpr=0 corresponds to r3,
// * gpr=1 to r4, etc.
// */
// char fpr; /* index into the array of 8 FPRs
// * stored in the register save area
// * fpr=0 corresponds to f1,
// * fpr=1 to f2, etc.
// */
// char *overflow_arg_area;
// /* location on stack that holds
// * the next overflow argument
// */
// char *reg_save_area;
// /* where r3:r10 and f1:f8 (if saved)
// * are stored
// */
// } va_list[1];
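// Resulting byte layout (editor's note): gpr at offset 0, fpr at offset 1,
// two bytes of padding, overflow_arg_area at offset 4, reg_save_area at
// offset 8 -- 12 bytes total, matching the memcpy size in LowerVACOPY.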
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
PtrVT);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
PtrVT);
uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);
uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);
uint64_t FPROffset = 1;
SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Store first byte : number of int regs
SDValue firstStore =
DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
MachinePointerInfo(SV), MVT::i8);
uint64_t nextOffset = FPROffset;
SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
ConstFPROffset);
// Store second byte : number of float regs
SDValue secondStore =
DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
MachinePointerInfo(SV, nextOffset), MVT::i8);
nextOffset += StackOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
// Store second word : arguments given on stack
SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
MachinePointerInfo(SV, nextOffset));
nextOffset += FrameOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
// Store third word : arguments given in registers
return DAG.getStore(thirdStore, dl, FR, nextPtr,
MachinePointerInfo(SV, nextOffset));
}
/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
PPC::F11, PPC::F12, PPC::F13};
/// QFPR - The set of QPX registers that should be allocated for arguments.
static const MCPhysReg QFPR[] = {
PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
unsigned ArgSize = ArgVT.getStoreSize();
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
// Round up to multiples of the pointer size, except for array members,
// which are always packed.
if (!Flags.isInConsecutiveRegs())
ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
return ArgSize;
}
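// (Editor's example: a 3-byte byval argument with PtrByteSize == 8 reserves a
// full 8-byte slot above; array members flagged InConsecutiveRegs keep their
// packed size instead.)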
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
Align Alignment(PtrByteSize);
// Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Alignment = Align(16);
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
Alignment = Align(32);
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
auto BVAlign = Flags.getNonZeroByValAlign();
if (BVAlign > PtrByteSize) {
if (BVAlign.value() % PtrByteSize != 0)
llvm_unreachable(
"ByVal alignment is not a multiple of the pointer size");
Alignment = BVAlign;
}
}
// Array members are always packed to their original alignment.
if (Flags.isInConsecutiveRegs()) {
// If the array member was split into multiple registers, the first
// needs to be aligned to the size of the full type. (Except for
// ppcf128, which is only aligned as its f64 components.)
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
Alignment = Align(OrigVT.getStoreSize());
else
Alignment = Align(ArgVT.getStoreSize());
}
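// (Editor's example: a split ppcf128 array member keeps the 8-byte alignment
// of its f64 halves, while other split members are aligned to the full size
// of the original type.)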
return Alignment;
}
/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize,
unsigned LinkageSize,
unsigned ParamAreaSize,
unsigned &ArgOffset,
unsigned &AvailableFPRs,
unsigned &AvailableVRs, bool HasQPX) {
bool UseMemory = false;
// Respect alignment of argument on the stack.
Align Alignment =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
if (ArgOffset >= LinkageSize + ParamAreaSize)
UseMemory = true;
// Allocate argument on the stack.
ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// If we overran the argument save area, we must use memory
// (this check catches arguments passed partially in memory)
if (ArgOffset > LinkageSize + ParamAreaSize)
UseMemory = true;
// However, if the argument is actually passed in an FPR or a VR,
// we don't use memory after all.
if (!Flags.isByVal()) {
if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
// QPX registers overlap with the scalar FP registers.
(HasQPX && (ArgVT == MVT::v4f32 ||
ArgVT == MVT::v4f64 ||
ArgVT == MVT::v4i1)))
if (AvailableFPRs > 0) {
--AvailableFPRs;
return false;
}
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
if (AvailableVRs > 0) {
--AvailableVRs;
return false;
}
}
return UseMemory;
}
/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure the minimum alignment required for the target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
unsigned NumBytes) {
return alignTo(NumBytes, Lowering->getStackAlign());
}
SDValue PPCTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
if (Subtarget.isAIXABI())
return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
if (Subtarget.is64BitELFABI())
return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
if (Subtarget.is32BitELFABI())
return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
}
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// 32-bit SVR4 ABI Stack Frame Layout:
// +-----------------------------------+
// +--> | Back chain |
// | +-----------------------------------+
// | | Floating-point register save area |
// | +-----------------------------------+
// | | General register save area |
// | +-----------------------------------+
// | | CR save word |
// | +-----------------------------------+
// | | VRSAVE save word |
// | +-----------------------------------+
// | | Alignment padding |
// | +-----------------------------------+
// | | Vector register save area |
// | +-----------------------------------+
// | | Local variable space |
// | +-----------------------------------+
// | | Parameter list area |
// | +-----------------------------------+
// | | LR save word |
// | +-----------------------------------+
// SP--> +--- | Back chain |
// +-----------------------------------+
//
// Specifications:
// System V Application Binary Interface PowerPC Processor Supplement
// AltiVec Technology Programming Interface Manual
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
const Align PtrAlign(4);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
CCInfo.clearWasPPCF128();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// Arguments stored in registers.
if (VA.isRegLoc()) {
const TargetRegisterClass *RC;
EVT ValVT = VA.getValVT();
switch (ValVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("ValVT not supported by formal arguments Lowering");
case MVT::i1:
case MVT::i32:
RC = &PPC::GPRCRegClass;
break;
case MVT::f32:
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
else if (Subtarget.hasSPE())
RC = &PPC::GPRCRegClass;
else
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else if (Subtarget.hasSPE())
// SPE passes doubles in GPR pairs.
RC = &PPC::GPRCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
RC = &PPC::VRRCRegClass;
break;
case MVT::v4f32:
RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
break;
case MVT::v2f64:
case MVT::v2i64:
RC = &PPC::VRRCRegClass;
break;
case MVT::v4f64:
RC = &PPC::QFRCRegClass;
break;
case MVT::v4i1:
RC = &PPC::QBRCRegClass;
break;
}
SDValue ArgValue;
// Transform the arguments stored in physical registers into
// virtual ones.
if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
assert(i + 1 < e && "No second half of double precision argument");
unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
if (!Subtarget.isLittleEndian())
std::swap (ArgValueLo, ArgValueHi);
ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
ArgValueHi);
} else {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
ValVT == MVT::i1 ? MVT::i32 : ValVT);
if (ValVT == MVT::i1)
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
}
InVals.push_back(ArgValue);
} else {
// Argument stored in memory.
assert(VA.isMemLoc());
// Get the (possibly extended) size of the argument type on the stack
unsigned ArgSize = VA.getLocVT().getStoreSize();
// Get the actual size of the argument type
unsigned ObjSize = VA.getValVT().getStoreSize();
unsigned ArgOffset = VA.getLocMemOffset();
// Stack objects in PPC32 are right justified.
ArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(
DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
// Assign locations to all of the incoming aggregate by value arguments.
// Aggregates passed by value are stored in the local variable space of the
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
MinReservedArea = std::max(MinReservedArea, LinkageSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
SmallVector<SDValue, 8> MemOps;
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
static const MCPhysReg GPArgRegs[] = {
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
static const MCPhysReg FPArgRegs[] = {
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
PPC::F8
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
if (useSoftFloat() || hasSPE())
NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
// Make room for NumGPArgRegs and NumFPArgRegs.
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
FuncInfo->setVarArgsStackOffset(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
CCInfo.getNextStackOffset(), true));
FuncInfo->setVarArgsFrameIndex(
MFI.CreateStackObject(Depth, Align(8), false));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
// VarArgsFrameIndex on the stack so that they may be loaded by
// dereferencing the result of va_next.
for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
// FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
// is set.
// The double arguments are stored to the VarArgsFrameIndex
// on the stack.
for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
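// (Editor's sketch: for a sign-extended i32 argument this wraps the i64 copy
// in AssertSext(..., i32) and then truncates to i32, so later combines know
// the upper 32 bits were sign bits.)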
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
EVT ObjectVT, SelectionDAG &DAG,
SDValue ArgVal,
const SDLoc &dl) const {
if (Flags.isSExt())
ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
else if (Flags.isZExt())
ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 8;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned Num_GPR_Regs = array_lengthof(GPR);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
const unsigned Num_QFPR_Regs = Num_FPR_Regs;
// Do a first pass over the arguments to determine whether the ABI
// guarantees that our caller has allocated the parameter save area
// on its stack frame. In the ELFv1 ABI, this is always the case;
// in the ELFv2 ABI, it is true if this is a vararg function or if
// any parameter is located in a stack slot.
bool HasParameterArea = !isELFv2ABI || isVarArg;
unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = Num_FPR_Regs;
unsigned AvailableVRs = Num_VR_Regs;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (Ins[i].Flags.isNest())
continue;
if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
HasParameterArea = true;
}
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
EVT OrigVT = Ins[ArgNo].ArgVT;
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
unsigned CurArgOffset;
Align Alignment;
auto ComputeArgOffset = [&]() {
/* Respect alignment of argument on the stack. */
Alignment =
CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
CurArgOffset = ArgOffset;
};
if (CallConv != CallingConv::Fast) {
ComputeArgOffset();
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
}
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Empty aggregate parameters do not take up registers. Examples:
// struct { } a;
// union { } b;
// int c[0];
// etc. However, we have to provide a place-holder in InVals, so
// pretend we have an 8-byte item at the current address for that
// purpose.
if (!ObjSize) {
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
continue;
}
// Create a stack object covering all stack doublewords occupied
// by the argument. If the argument is (fully or partially) on
// the stack, or if the argument is fully in registers but the
// caller has allocated the parameter save area anyway, we can refer
// directly to the caller's stack frame. Otherwise, create a
// local copy in our own frame.
int FI;
if (HasParameterArea ||
ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
else
FI = MFI.CreateStackObject(ArgSize, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Handle aggregates smaller than 8 bytes.
if (ObjSize < PtrByteSize) {
// The value of the object is its address, which differs from the
// address of the enclosing doubleword on big-endian systems.
SDValue Arg = FIN;
if (!isLittleEndian) {
SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
}
InVals.push_back(Arg);
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store;
if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
MachinePointerInfo(&*FuncArg), ObjType);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
// slot.
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg));
}
MemOps.push_back(Store);
}
// Whether we copied from a register or not, advance the offset
// into the parameter save area by a full doubleword.
ArgOffset += PtrByteSize;
continue;
}
// The value of the object is its address, which is the address of
// its first stack doubleword.
InVals.push_back(FIN);
// Store whatever pieces of the object are in registers to memory.
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
if (GPR_idx == Num_GPR_Regs)
break;
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
if (j) {
SDValue Off = DAG.getConstant(j, dl, PtrVT);
Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
}
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
}
ArgOffset += ArgSize;
continue;
}
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
break;
}
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
ArgSize = PtrByteSize;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 8;
break;
case MVT::f32:
case MVT::f64:
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// float aggregates.
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx],
Subtarget.hasP8Vector()
? &PPC::VSSRCRegClass
: &PPC::F4RCRegClass);
else
VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
? &PPC::VSFRCRegClass
: &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
} else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// This can only ever happen in the presence of f32 array types,
// since otherwise we never run out of FPRs before running out
// of GPRs.
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::f32) {
if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
}
ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (CallConv != CallingConv::Fast || needsLoad) {
ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
ArgOffset += ArgSize;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// vector aggregates.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++VR_idx;
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 16;
break;
} // not QPX
assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v4i1:
// QPX vectors are treated like their scalar floating-point subregisters
// (except that they're larger).
unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
if (QFPR_idx != Num_QFPR_Regs) {
const TargetRegisterClass *RC;
switch (ObjectVT.getSimpleVT().SimpleTy) {
case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
default: RC = &PPC::QBRCRegClass; break;
}
unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++QFPR_idx;
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += Sz;
break;
}
// We need to load the argument to a virtual register if we determined
// above that we ran out of physical registers of the appropriate type.
if (needsLoad) {
if (ObjSize < ArgSize && !isLittleEndian)
CurArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
}
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea;
if (HasParameterArea)
MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
else
MinReservedArea = LinkageSize;
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
// The ELFv2 ABI spec notes:
// C programs that are intended to be *portable* across different compilers
// and architectures must use the header file <stdarg.h> to deal with variable
// argument lists.
if (isVarArg && MFI.hasVAStart()) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
MFI.CreateFixedObject(PtrByteSize, Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = isPPC64 ? 8 : 4;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned ArgOffset = LinkageSize;
// Area that is at least reserved in caller of this function.
unsigned MinReservedArea = ArgOffset;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
// In 32-bit non-varargs functions, the stack space for vectors is after the
// stack space for non-vectors. We do not use this space unless we have
// too many vectors to fit in registers, something that only occurs in
// constructed examples, but we have to walk the arglist to figure
// that out...for the pathological case, compute VecArgOffset as the
// start of the vector parameter area. Computing VecArgOffset is the
// entire point of the following loop.
unsigned VecArgOffset = ArgOffset;
if (!isVarArg && !isPPC64) {
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
++ArgNo) {
EVT ObjectVT = Ins[ArgNo].VT;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Flags.isByVal()) {
// ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of
// the register size.
unsigned ObjSize = Flags.getByValSize();
unsigned ArgSize =
((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
VecArgOffset += ArgSize;
continue;
}
switch(ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::f32:
VecArgOffset += 4;
break;
case MVT::i64: // PPC64
case MVT::f64:
// FIXME: We are guaranteed to be !isPPC64 at this point.
// Does MVT::i64 apply?
VecArgOffset += 8;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Nothing to do, we're only looking at Nonvector args here.
break;
}
}
}
// We've found where the vector parameter area in memory is. Skip the
// first 12 parameters; these don't use that memory.
VecArgOffset = ((VecArgOffset+15)/16)*16;
VecArgOffset += 12*16;
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
unsigned CurArgOffset = ArgOffset;
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
if (isVarArg || isPPC64) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += CalculateStackSlotSize(ObjectVT,
Flags,
PtrByteSize);
} else nAltivecParamsAtEnd++;
} else
// Calculate min reserved area.
MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
Flags,
PtrByteSize);
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
// ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of
// the register size.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Objects of size 1 and 2 are right justified, everything else is
// left justified. This means the memory address is adjusted forwards.
if (ObjSize==1 || ObjSize==2) {
CurArgOffset = CurArgOffset + (4 - ObjSize);
}
// The value of the object is its address.
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
if (ObjSize==1 || ObjSize==2) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store =
DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg), ObjType);
MemOps.push_back(Store);
++GPR_idx;
}
ArgOffset += PtrByteSize;
continue;
}
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
// Store whatever pieces of the object are in registers
// to memory. ArgOffset will be the address of the beginning
// of the object.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
ArgOffset += PtrByteSize;
} else {
ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
break;
}
}
continue;
}
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
if (!isPPC64) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (ObjectVT == MVT::i1)
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
}
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += PtrByteSize;
break;
}
LLVM_FALLTHROUGH;
case MVT::i64: // PPC64
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
}
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += 8;
break;
case MVT::f32:
case MVT::f64:
// Every 4 bytes of argument space consumes one of the GPRs available for
// argument passing.
if (GPR_idx != Num_GPR_Regs) {
++GPR_idx;
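// On 32-bit targets a 64-bit FP argument shadows a second GPR.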
if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
++GPR_idx;
}
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
else
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
} else {
needsLoad = true;
}
// All FP arguments reserve stack space in the Darwin ABI.
ArgOffset += isPPC64 ? 8 : ObjSize;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Note that vector arguments in registers don't reserve stack space,
// except in varargs functions.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
if (isVarArg) {
while ((ArgOffset % 16) != 0) {
ArgOffset += PtrByteSize;
if (GPR_idx != Num_GPR_Regs)
GPR_idx++;
}
ArgOffset += 16;
GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
}
++VR_idx;
} else {
if (!isVarArg && !isPPC64) {
// Vectors go after all the nonvectors.
CurArgOffset = VecArgOffset;
VecArgOffset += 16;
} else {
// Vectors are aligned.
ArgOffset = ((ArgOffset+15)/16)*16;
CurArgOffset = ArgOffset;
ArgOffset += 16;
}
needsLoad = true;
}
break;
}
// We need to load the argument to a virtual register if we determined above
// that we ran out of physical registers of the appropriate type.
if (needsLoad) {
int FI = MFI.CreateFixedObject(ObjSize,
CurArgOffset + (ArgSize - ObjSize),
isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
}
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += 16*nAltivecParamsAtEnd;
}
// Area that is at least reserved in the caller of this function.
MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
// Set the size that is at least reserved in the caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by the pointer size for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
unsigned ParamSize) {
if (!isTailCall) return 0;
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
unsigned CallerMinReservedArea = FI->getMinReservedArea();
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
// Remember only if the new adjustment is bigger.
if (SPDiff < FI->getTailCallSPDelta())
FI->setTailCallSPDelta(SPDiff);
return SPDiff;
}
static bool isFunctionGlobalAddress(SDValue Callee);
static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
// It does not make sense to call callsShareTOCBase() with a caller that
// is PC Relative since PC Relative callers do not have a TOC.
#ifndef NDEBUG
const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
assert(!STICaller->isUsingPCRelativeCalls() &&
"PC Relative callers do not have a TOC and cannot share a TOC Base");
#endif
// Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
// don't have enough information to determine if the caller and callee share
// the same TOC base, so we have to pessimistically assume they don't for
// correctness.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G)
return false;
const GlobalValue *GV = G->getGlobal();
// If the callee is preemptable, then the static linker will use a plt-stub
// which saves the toc to the stack, and needs a nop after the call
// instruction to convert to a toc-restore.
if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
return false;
// Functions with PC Relative enabled may clobber the TOC in the same DSO.
// We may need a TOC restore in the situation where the caller requires a
// valid TOC but the callee is PC Relative and does not.
const Function *F = dyn_cast<Function>(GV);
const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
// If we have an Alias we can try to get the function from there.
if (Alias) {
const GlobalObject *GlobalObj = Alias->getBaseObject();
F = dyn_cast<Function>(GlobalObj);
}
// If we still have no valid function pointer we do not have enough
// information to determine if the callee uses PC Relative calls so we must
// assume that it does.
if (!F)
return false;
// If the callee uses PC Relative we cannot guarantee that the callee won't
// clobber the TOC of the caller and so we must assume that the two
// functions do not share a TOC base.
const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
if (STICallee->isUsingPCRelativeCalls())
return false;
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
// single TOC.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
return true;
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
if (!GV->isStrongDefinitionForLinker())
return false;
// Any explicitly-specified sections and section prefixes must also match.
// Also, if we're using -ffunction-sections, then each function is always in
// a different section (the same is true for COMDAT functions).
if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
GV->getSection() != Caller->getSection())
return false;
if (const auto *F = dyn_cast<Function>(GV)) {
if (F->getSectionPrefix() != Caller->getSectionPrefix())
return false;
}
return true;
}
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
const SmallVectorImpl<ISD::OutputArg> &Outs) {
assert(Subtarget.is64BitELFABI());
const unsigned PtrByteSize = 8;
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
for (const ISD::OutputArg& Param : Outs) {
if (Param.Flags.isNest()) continue;
if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
return true;
}
return false;
}
static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
if (CB.arg_size() != CallerFn->arg_size())
return false;
auto CalleeArgIter = CB.arg_begin();
auto CalleeArgEnd = CB.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
const Value* CalleeArg = *CalleeArgIter;
const Value* CallerArg = &(*CallerArgIter);
if (CalleeArg == CallerArg)
continue;
// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
// tail call @callee([4 x i64] undef, [4 x i64] %b)
// }
// 1st argument of callee is undef and has the same type as caller.
if (CalleeArg->getType() == CallerArg->getType() &&
isa<UndefValue>(CalleeArg))
continue;
return false;
}
return true;
}
// Returns true if TCO is possible between the callers and callees
// calling conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
CallingConv::ID CalleeCC) {
// Tail calls are possible with fastcc and ccc.
auto isTailCallableCC = [] (CallingConv::ID CC){
return CC == CallingConv::C || CC == CallingConv::Fast;
};
if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
return false;
// We can safely tail call both fastcc and ccc callees from a c calling
// convention caller. If the caller is fastcc, we may have less stack space
// than a non-fastcc caller with the same signature so disable tail-calls in
// that case.
return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
if (DisableSCO && !TailCallOpt) return false;
// Variadic argument functions are not supported.
if (isVarArg) return false;
auto &Caller = DAG.getMachineFunction().getFunction();
// Check that the calling conventions are compatible for tco.
if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
return false;
// A caller with any byval parameter is not supported.
if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
return false;
// A callee with any byval parameter is not supported either.
// Note: This is a quick work around, because in some cases, e.g.
// caller's stack size > callee's stack size, we are still able to apply
// sibling call optimization. For example, gcc is able to do SCO for caller1
// in the following example, but not for caller2.
// struct test {
// long int a;
// char ary[56];
// } gTest;
// __attribute__((noinline)) int callee(struct test v, struct test *b) {
// b->a = v.a;
// return 0;
// }
// void caller1(struct test a, struct test c, struct test *b) {
// callee(gTest, b); }
// void caller2(struct test *b) { callee(gTest, b); }
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
return false;
// If callee and caller use different calling conventions, we cannot pass
// parameters on stack since offsets for the parameter area may be different.
if (Caller.getCallingConv() != CalleeCC &&
needStackSlotPassParameters(Subtarget, Outs))
return false;
// All variants of 64-bit ELF ABIs without PC-Relative addressing require that
// the caller and callee share the same TOC for TCO/SCO. If the caller and
// callee potentially have different TOC bases then we cannot tail call since
// we need to restore the TOC pointer after the call.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
// We cannot guarantee this for indirect calls or calls to external functions.
// When PC-Relative addressing is used, the concept of the TOC is no longer
// applicable so this check is not required.
// Check first for indirect calls.
if (!Subtarget.isUsingPCRelativeCalls() &&
!isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
return false;
// Check if we share the TOC base.
if (!Subtarget.isUsingPCRelativeCalls() &&
!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
if (CalleeCC == CallingConv::Fast && TailCallOpt)
return true;
if (DisableSCO) return false;
// If the callee uses the same argument list as the caller, we can apply SCO
// on this call. Otherwise, we need to check whether the callee needs stack
// space for passing arguments.
// PC Relative tail calls may not have a CallBase.
// If there is no CallBase we cannot verify if we have the same argument
// list so assume that we don't have the same argument list.
if (CB && !hasSameArgumentList(&Caller, *CB) &&
needStackSlotPassParameters(Subtarget, Outs))
return false;
else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
return false;
return true;
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
return false;
// Variable argument functions are not supported.
if (isVarArg)
return false;
MachineFunction &MF = DAG.getMachineFunction();
CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
// Functions containing by val parameters are not supported.
for (unsigned i = 0; i != Ins.size(); i++) {
ISD::ArgFlagsTy Flags = Ins[i].Flags;
if (Flags.isByVal()) return false;
}
// Non-PIC/GOT tail calls are supported.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
return true;
// At the moment we can only do local tail calls (in same module, hidden
// or protected) if we are generating PIC.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
return G->getGlobal()->hasHiddenVisibility()
|| G->getGlobal()->hasProtectedVisibility();
}
return false;
}
/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C) return nullptr;
int Addr = C->getZExtValue();
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
SignExtend32<26>(Addr) != Addr)
return nullptr; // Top 6 bits have to be sext of immediate.
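// The two implied zero bits are dropped, so the value actually encoded (and
// returned here) is the address shifted right by 2.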
return DAG
.getConstant(
(int)C->getZExtValue() >> 2, SDLoc(Op),
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
.getNode();
}
namespace {
struct TailCallArgumentInfo {
SDValue Arg;
SDValue FrameIdxOp;
int FrameIdx = 0;
TailCallArgumentInfo() = default;
};
} // end anonymous namespace
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
SelectionDAG &DAG, SDValue Chain,
const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
SDValue Arg = TailCallArgs[i].Arg;
SDValue FIN = TailCallArgs[i].FrameIdxOp;
int FI = TailCallArgs[i].FrameIdx;
// Store relative to framepointer.
MemOpChains.push_back(DAG.getStore(
Chain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
SDValue OldRetAddr, SDValue OldFP,
int SPDiff, const SDLoc &dl) {
if (SPDiff) {
// Calculate the new stack slot for the return address.
MachineFunction &MF = DAG.getMachineFunction();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
bool isPPC64 = Subtarget.isPPC64();
int SlotSize = isPPC64 ? 8 : 4;
int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
NewRetAddrLoc, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
}
return Chain;
}
/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
/// the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
SDValue Arg, int SPDiff, unsigned ArgOffset,
SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
int Offset = ArgOffset + SPDiff;
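// Size of the argument in bytes, rounding its bit width up to whole bytes.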
uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue FIN = DAG.getFrameIndex(FI, VT);
TailCallArgumentInfo Info;
Info.Arg = Arg;
Info.FrameIdxOp = FIN;
Info.FrameIdx = FI;
TailCallArguments.push_back(Info);
}
/// EmitTailCallLoadFPAndRetAddr - Emit loads from the frame pointer and return
/// address stack slots. Returns the chain as result and the loaded values in
/// LROpOut/FPOpOut. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
SDValue &FPOpOut, const SDLoc &dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
}
return Chain;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size". Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
Flags.getNonZeroByValAlign(), false, false, false,
MachinePointerInfo(), MachinePointerInfo());
}
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
static void LowerMemOpCallTo(
SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
if (!isTailCall) {
if (isVector) {
SDValue StackPtr;
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
else
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
}
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
// Calculate and remember argument location.
} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
TailCallArguments);
}
static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
SDValue FPOp,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
// Emit a sequence of copyto/copyfrom virtual registers for arguments that
// might overwrite each other in case of tail call optimization.
SmallVector<SDValue, 8> MemOpChains2;
// Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
MemOpChains2, dl);
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
// Emit callseq_end just before tailcall node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
// Is this global address that of a function that can be called by name? (as
// opposed to something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(SDValue Callee) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
return G->getGlobal()->getValueType()->isFunctionTy();
}
return false;
}
SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCRetInfo.AnalyzeCallResult(
Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Val;
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
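// SPE returns an f64 in a pair of i32 registers: copy out both halves and
// rebuild the f64 value, swapping the halves on big-endian targets.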
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
VA = RVLocs[++i]; // skip ahead to next loc
SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
if (!Subtarget.isLittleEndian())
std::swap (Lo, Hi);
Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
} else {
Val = DAG.getCopyFromReg(Chain, dl,
VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
}
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
}
InVals.push_back(Val);
}
return Chain;
}
static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
const PPCSubtarget &Subtarget, bool isPatchPoint) {
// PatchPoint calls are not indirect.
if (isPatchPoint)
return false;
if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
return false;
// Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot
// because the immediate function pointer points to a descriptor instead of
// a function entry point. The ELFv2 ABI cannot use a BLA because the function
// pointer immediate points to the global entry point, while the BLA would
// need to jump to the local entry point (see rL211174).
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
isBLACompatibleAddress(Callee, DAG))
return false;
return true;
}
// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
return Subtarget.isAIXABI() ||
(Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
}
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
const Function &Caller,
const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
if (CFlags.IsTailCall)
return PPCISD::TC_RETURN;
// This is a call through a function pointer.
if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
// indirect calls. The save of the caller's TOC pointer to the stack will be
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the 2 instruction sequence of an indirect branch and link,
// immediately followed by a load of the TOC pointer from the stack save
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
// as it is not saved or used.
return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
: PPCISD::BCTRL;
}
if (Subtarget.isUsingPCRelativeCalls()) {
assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
return PPCISD::CALL_NOTOC;
}
// The ABIs that maintain a TOC pointer across calls need to have a nop
// immediately following the call instruction if the caller and callee may
// have different TOC bases. At link time if the linker determines the calls
// may not share a TOC base, the call is redirected to a trampoline inserted
// by the linker. The trampoline will (among other things) save the caller's
// TOC pointer at an ABI designated offset in the linkage area and the linker
// will rewrite the nop to be a load of the TOC pointer from the linkage area
// into gpr2.
if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
: PPCISD::CALL_NOP;
return PPCISD::CALL;
}
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const SDLoc &dl, const PPCSubtarget &Subtarget) {
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
return SDValue(Dest, 0);
// Returns true if the callee is local, and false otherwise.
auto isLocalCallee = [&]() {
const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
!dyn_cast_or_null<GlobalIFunc>(GV);
};
// The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
// a static relocation model causes some versions of GNU LD (2.17.50, at
// least) to force BSS-PLT, instead of secure-PLT, even if all objects are
// built with secure-PLT.
bool UsePlt =
Subtarget.is32BitELFABI() && !isLocalCallee() &&
Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
// On AIX, direct function calls reference the symbol for the function's
// entry point, which is named by prepending a "." before the function's
// C-linkage name.
const auto getAIXFuncEntryPointSymbolSDNode =
[&](StringRef FuncName, bool IsDeclaration,
const XCOFF::StorageClass &SC) {
auto &Context = DAG.getMachineFunction().getMMI().getContext();
MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
if (IsDeclaration && !S->hasRepresentedCsectSet()) {
// On AIX, an undefined symbol needs to be associated with a
// MCSectionXCOFF to get the correct storage mapping class.
// In this case, XCOFF::XMC_PR.
MCSectionXCOFF *Sec = Context.getXCOFFSection(
S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
SectionKind::getMetadata());
S->setRepresentedCsect(Sec);
}
MVT PtrVT =
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
return DAG.getMCSymbol(S, PtrVT);
};
if (isFunctionGlobalAddress(Callee)) {
const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
const GlobalValue *GV = G->getGlobal();
if (!Subtarget.isAIXABI())
return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
UsePlt ? PPCII::MO_PLT : 0);
assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
const GlobalObject *GO = cast<GlobalObject>(GV);
const XCOFF::StorageClass SC =
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(),
SC);
}
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *SymName = S->getSymbol();
if (!Subtarget.isAIXABI())
return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
UsePlt ? PPCII::MO_PLT : 0);
// If there exists a user-declared function whose name is the same as the
// ExternalSymbol's, then we pick up the user-declared version.
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
if (const Function *F =
dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) {
const XCOFF::StorageClass SC =
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F);
return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(),
SC);
}
return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
}
// No transformation needed.
assert(Callee.getNode() && "What no callee?");
return Callee;
}
static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
"Expected a CALLSEQ_STARTSDNode.");
// The last operand is the chain, except when the node has glue. If the node
// has glue, then the last operand is the glue, and the chain is the second
// last operand.
SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1);
if (LastValue.getValueType() != MVT::Glue)
return LastValue;
return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2);
}
// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
const SDLoc &dl) {
SDValue MTCTROps[] = {Chain, Callee, Glue};
EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2),
makeArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
// The glue is the second value produced.
Glue = Chain.getValue(1);
}
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
SDValue CallSeqStart,
const CallBase *CB, const SDLoc &dl,
bool hasNest,
const PPCSubtarget &Subtarget) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
// entry point, but to the function descriptor (the function entry point
// address is part of the function descriptor though).
// The function descriptor is a three doubleword structure with the
// following fields: function entry point, TOC base address and
// environment pointer.
// Thus for a call through a function pointer, the following actions need
// to be performed:
// 1. Save the TOC of the caller in the TOC save area of its stack
// frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
// 2. Load the address of the function entry point from the function
// descriptor.
// 3. Load the TOC of the callee from the function descriptor into r2.
// 4. Load the environment pointer from the function descriptor into
// r11.
// 5. Branch to the function entry point address.
// 6. On return of the callee, the TOC of the caller needs to be
// restored (this is done in FinishCall()).
//
// The loads are scheduled at the beginning of the call sequence, and the
// register copies are flagged together to ensure that no other
// operations can be scheduled in between. E.g. without flagging the
// copies together, a TOC access in the caller could be scheduled between
// the assignment of the callee TOC and the branch to the callee, which leads
// to incorrect code.
// Start by loading the function address from the descriptor.
SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
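// The descriptor loads may be marked dereferenceable and invariant when the
// subtarget guarantees that function descriptors never change.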
auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
? (MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
// Registers used in building the DAG.
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
// Offsets of descriptor members.
const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;
// One load for the function's entry point address.
SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
Alignment, MMOFlags);
// One for loading the TOC anchor for the module that contains the called
// function.
SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
SDValue TOCPtr =
DAG.getLoad(RegVT, dl, LDChain, AddTOC,
MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);
// One for loading the environment pointer.
SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
SDValue LoadEnvPtr =
DAG.getLoad(RegVT, dl, LDChain, AddPtr,
MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);
// Then copy the newly loaded TOC anchor to the TOC pointer.
SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
Chain = TOCVal.getValue(0);
Glue = TOCVal.getValue(1);
// If the function call has an explicit 'nest' parameter, it takes the
// place of the environment pointer.
assert((!hasNest || !Subtarget.isAIXABI()) &&
"Nest parameter is not supported on AIX.");
if (!hasNest) {
SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
Chain = EnvVal.getValue(0);
Glue = EnvVal.getValue(1);
}
// The rest of the indirect call sequence is the same as the non-descriptor
// DAG.
prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
}
static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops,
PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
const PPCSubtarget &Subtarget) {
const bool IsPPC64 = Subtarget.isPPC64();
// MVT for a general purpose register.
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
// First operand is always the chain.
Ops.push_back(Chain);
// If it's a direct call pass the callee as the second operand.
if (!CFlags.IsIndirect)
Ops.push_back(Callee);
else {
assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
// on the stack (this would have been done in `LowerCall_64SVR4` or
// `LowerCall_AIX`). The call instruction is a pseudo instruction that
// represents both the indirect branch and a load that restores the TOC
// pointer from the linkage area. The operand for the TOC restore is an add
// of the TOC save offset to the stack pointer. This must be the second
// operand: after the chain input but before any other variadic arguments.
// For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
// saved or used.
if (isTOCSaveRestoreRequired(Subtarget)) {
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
Ops.push_back(AddTOC);
}
// Add the register used for the environment pointer.
if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
RegVT));
// Add CTR register as callee so a bctr can be emitted later.
if (CFlags.IsTailCall)
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
}
// If this is a tail call, add the stack pointer delta.
if (CFlags.IsTailCall)
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
// no way to mark dependencies as implicit here.
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
!CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// If the glue is valid, it is the last operand.
if (Glue.getNode())
Ops.push_back(Glue);
}
SDValue PPCTargetLowering::FinishCall(
CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
Subtarget.isAIXABI())
setUsesTOCBasePtr(DAG);
unsigned CallOpc =
getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
Subtarget, DAG.getTarget());
if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
else if (Subtarget.usesFunctionDescriptors())
prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
dl, CFlags.HasNest, Subtarget);
else
prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
// Build the operand list for the call instruction.
SmallVector<SDValue, 8> Ops;
buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
SPDiff, Subtarget);
// Emit tail call.
if (CFlags.IsTailCall) {
// Indirect tail calls when using PC Relative calls do not have the same
// constraints.
assert(((Callee.getOpcode() == ISD::Register &&
cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
Callee.getOpcode() == ISD::TargetExternalSymbol ||
Callee.getOpcode() == ISD::TargetGlobalAddress ||
isa<ConstantSDNode>(Callee) ||
(CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
"Expecting a global address, external symbol, absolute value, "
"register or an indirect tail call when PC Relative calls are "
"used.");
// PC Relative calls also use TC_RETURN as the way to mark tail calls.
assert(CallOpc == PPCISD::TC_RETURN &&
"Unexpected call opcode for a tail call.");
DAG.getMachineFunction().getFrameInfo().setHasTailCall();
return DAG.getNode(CallOpc, dl, MVT::Other, Ops);
}
std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
Glue = Chain.getValue(1);
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCFrameLowering::eliminateCallFramePseudoInstr.
int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
getTargetMachine().Options.GuaranteedTailCallOpt)
? NumBytes
: 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(BytesCalleePops, dl, true),
Glue, dl);
Glue = Chain.getValue(1);
return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
DAG, InVals);
}
SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
const CallBase *CB = CLI.CB;
if (isTailCall) {
if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall = IsEligibleForTailCallOptimization_64SVR4(
Callee, CallConv, CB, isVarArg, Outs, Ins, DAG);
else
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
if (isTailCall) {
++NumTailCalls;
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
++NumSiblingCalls;
// PC Relative calls no longer guarantee that the callee is a Global
// Address Node. The callee could be an indirect tail call in which
// case the SDValue for the callee could be a load (to load the address
// of a function pointer) or it may be a register copy (to move the
// address of the callee from a function parameter into a virtual
// register). It may also be an ExternalSymbolSDNode (e.g. memcpy).
assert((Subtarget.isUsingPCRelativeCalls() ||
isa<GlobalAddressSDNode>(Callee)) &&
"Callee should be an llvm::Function object.");
LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
<< "\nTCO callee: ");
LLVM_DEBUG(Callee.dump());
}
}
if (!isTailCall && CB && CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// When long calls (i.e. indirect calls) are always used, calls are always
// made via function pointer. If we have a function name, first translate it
// into a pointer.
if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
CallFlags CFlags(
CallConv, isTailCall, isVarArg, isPatchPoint,
isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
// hasNest
Subtarget.is64BitELFABI() &&
any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
CLI.NoMerge);
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
if (Subtarget.isSVR4ABI())
return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
if (Subtarget.isAIXABI())
return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
}
SDValue PPCTargetLowering::LowerCall_32SVR4(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
const CallingConv::ID CallConv = CFlags.CallConv;
const bool IsVarArg = CFlags.IsVarArg;
const bool IsTailCall = CFlags.IsTailCall;
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
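// Pointers are 4 bytes in the 32-bit SVR4 ABI, so the stack reservations
// below use 4-byte alignment.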
const Align PtrAlign(4);
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a tail call. As a consequence,
// the frame pointer will be used for dynamic stack allocation and for restoring
// the caller's stack pointer in this function's epilogue. This is done because
// the tail-called function might overwrite the value in this function's (MF)
// stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
// area, parameter list area and the part of the local variable space which
// contains copies of aggregates which are passed by value.
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeCallOperands(Outs);
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
if (Outs[i].IsFixed) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
CCInfo);
} else {
Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo);
}
if (Result) {
#ifndef NDEBUG
errs() << "Call operand #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n";
#endif
llvm_unreachable(nullptr);
}
}
} else {
// All arguments are treated the same.
CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
}
CCInfo.clearWasPPCF128();
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
// Size of the linkage area, parameter list area and the part of the local
// variable space where copies of aggregates passed by value are stored.
unsigned NumBytes = CCByValInfo.getNextStackOffset();
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
// i - Tracks the index into the list of registers allocated for the call
// RealArgIdx - Tracks the index into the list of actual function arguments
// j - Tracks the index into the list of byval arguments
for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
i != e;
++i, ++RealArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[RealArgIdx];
ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
// create a copy of it in the local variable space of the current stack
// frame (which is the stack frame of the caller) and pass the address of
// this copy to the callee.
assert((j < ByValArgLocs.size()) && "Index out of bounds!");
CCValAssign &ByValVA = ByValArgLocs[j++];
assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
// Memory reserved in the local variable space of the caller's stack frame.
unsigned LocMemOffset = ByValVA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
// Create a copy of the argument in the local area of the current
// stack frame.
SDValue MemcpyCall =
CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
// Pass the address of the aggregate copy on the stack either in a
// physical register or in the parameter list area of the current stack
// frame to the callee.
Arg = PtrOff;
}
// When useCRBits() is true, there can be i1 arguments.
// It is because getRegisterType(MVT::i1) => MVT::i1,
// and for other integer types getRegisterType() => MVT::i32.
// Extend i1 and ensure callee will get i32.
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
dl, MVT::i32, Arg);
if (VA.isRegLoc()) {
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
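// SPE passes an f64 in a pair of GPRs: extract the two 32-bit halves and
// assign each half to its own register, ordered according to endianness.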
bool IsLE = Subtarget.isLittleEndian();
SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
SVal.getValue(0)));
} else
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
// Put argument in the parameter list area of the current stack frame.
assert(VA.isMemLoc());
unsigned LocMemOffset = VA.getLocMemOffset();
if (!IsTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
} else {
// Calculate and remember argument location.
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
TailCallArguments);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
// Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
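// (The 32-bit SVR4 ABI uses CR bit 6 to tell a variadic callee whether any
// floating-point arguments were passed in FPRs, so its va_start handling
// knows whether the FPR argument registers need to be saved.)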
if (IsVarArg) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, InFlag };
Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
InFlag = Chain.getValue(1);
}
if (IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) const {
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
}
SDValue PPCTargetLowering::LowerCall_64SVR4(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
bool IsSibCall = false;
bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a tail call. As a
// consequence, the frame pointer will be used for dynamic stack allocation
// and for restoring the caller's stack pointer in this function's epilogue.
// This is done because a tail-called function might overwrite the value in
// this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
assert(!(IsFastCall && CFlags.IsVarArg) &&
"fastcc not supported on varargs functions");
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
// reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
// area is 32 bytes reserved space for [SP][CR][LR][TOC].
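// (That is, 6 doublewords * 8 bytes = 48 bytes for ELFv1, and
// 4 doublewords * 8 bytes = 32 bytes for ELFv2.)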
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned NumQFPRs = NumFPRs;
// On ELFv2, we can avoid allocating the parameter area if all the arguments
// can be passed to the callee in registers.
// For the fast calling convention, there is another check below.
// Note: We should keep this consistent with LowerFormalArguments_64SVR4()
bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
if (!HasParameterArea) {
unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
unsigned NumBytesTmp = NumBytes;
for (unsigned i = 0; i != NumOps; ++i) {
if (Outs[i].Flags.isNest()) continue;
if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytesTmp, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
HasParameterArea = true;
}
}
// When using the fast calling convention, we don't provide backing for
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
// Avoid allocating parameter area for fastcc functions if all the arguments
// can be passed in the registers.
if (IsFastCall)
HasParameterArea = false;
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
if (Flags.isNest())
continue;
if (IsFastCall) {
if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
if (NumGPRsUsed > NumGPRs)
HasParameterArea = true;
} else {
switch (ArgVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (++NumGPRsUsed <= NumGPRs)
continue;
break;
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (++NumVRsUsed <= NumVRs)
continue;
break;
case MVT::v4f32:
// When using QPX, this is handled like an FP register; otherwise, it
// is an Altivec register.
if (Subtarget.hasQPX()) {
if (++NumFPRsUsed <= NumFPRs)
continue;
} else {
if (++NumVRsUsed <= NumVRs)
continue;
}
break;
case MVT::f32:
case MVT::f64:
case MVT::v4f64: // QPX
case MVT::v4i1: // QPX
if (++NumFPRsUsed <= NumFPRs)
continue;
break;
}
HasParameterArea = true;
}
}
/* Respect alignment of argument on the stack. */
auto Alignment =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
NumBytes = alignTo(NumBytes, Alignment);
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
unsigned NumBytesActuallyUsed = NumBytes;
// In the old ELFv1 ABI, the prolog code of the callee may store up to 8 GPR
// argument registers to the stack, allowing va_start to index over them in
// memory if the callee is variadic.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
// In the ELFv2 ABI, we allocate the parameter area iff a callee
// really requires memory operands, e.g. a vararg function.
if (HasParameterArea)
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
else
NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
if (!IsSibCall)
SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere
// else later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use in loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
// We re-align the argument offset for each argument, except when using the
// fast calling convention, in which case we only do so when the argument
// will actually use a stack slot.
auto ComputePtrOff = [&]() {
/* Respect alignment of argument on the stack. */
auto Alignment =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
};
if (!IsFastCall) {
ComputePtrOff();
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, NumGPRs);
}
// Promote integers to 64-bit values.
if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
}
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
// Note: Size includes alignment padding, so
// struct x { short a; char b; }
// will have Size = 4. With #pragma pack(1), it will have Size = 3.
// These are the proper values we need for right-justifying the
// aggregate in a parameter register.
unsigned Size = Flags.getByValSize();
// An empty aggregate parameter takes up no storage and no
// registers.
if (Size == 0)
continue;
if (IsFastCall)
ComputePtrOff();
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
continue;
}
}
if (GPR_idx == NumGPRs && Size < 8) {
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
PtrOff.getValueType());
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
}
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
continue;
}
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
// FIXME: The above statement is likely due to a misunderstanding of the
// documents. All arguments must be copied into the parameter area BY
// THE CALLEE in the event that the callee takes the address of any
// formal argument. That has not yet been implemented. However, it is
// reasonable to use the stack area as a staging area for the register
// load.
// Skip this for small aggregates, as we will use the same slot for a
// right-justified copy, below.
if (Size >= 8)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
CallSeqStart,
Flags, DAG, dl);
// When a register is available, pass a small aggregate right-justified.
if (Size < 8 && GPR_idx != NumGPRs) {
// The easiest way to get this right-justified in a register
// is to copy the structure into the rightmost portion of a
// local variable slot, then load the whole slot into the
// register.
// FIXME: The memcpy seems to produce pretty awful code for
// small aggregates, particularly for packed ones.
// FIXME: It would be preferable to use the slot in the
// parameter save area instead of a new local variable.
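// For example, on a big-endian target a 3-byte aggregate is memcpy'd to
// offset 8 - 3 = 5 within its doubleword slot, so the full doubleword load
// below leaves those bytes right-justified (in the least significant bytes)
// of the GPR.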
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
}
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
// Load the slot into the register.
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Done with this argument.
ArgOffset += PtrByteSize;
continue;
}
// For aggregates larger than PtrByteSize, copy the pieces of the
// object that fit into registers from the parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
break;
}
}
continue;
}
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
break;
}
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += PtrByteSize;
}
if (!IsFastCall)
ArgOffset += PtrByteSize;
break;
case MVT::f32:
case MVT::f64: {
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// float aggregates.
// Named arguments go into FPRs first, and once they overflow, the
// remaining arguments go into GPRs and then the parameter save area.
// Unnamed arguments for vararg functions always go to GPRs and
// then the parameter save area. For now, put all arguments to vararg
// routines always in both locations (FPR *and* GPR or stack slot).
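// Sketch of the routing below: a named float goes to the next FPR while
// FPRs remain; once FPRs run out (which, for non-vararg calls, only happens
// in the presence of f32 array elements), pairs of consecutive f32 elements
// are packed two per GPR, and anything that still has no register is stored
// to the parameter save area.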
bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
bool NeededLoad = false;
// First load the argument into the next available FPR.
if (FPR_idx != NumFPRs)
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
// Next, load the argument into GPR or stack slot if needed.
if (!NeedGPROrStack)
;
else if (GPR_idx != NumGPRs && !IsFastCall) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// In the non-vararg case, this can only ever happen in the
// presence of f32 array types, since otherwise we never run
// out of FPRs before running out of GPRs.
SDValue ArgVal;
// Double values are always passed in a single GPR.
if (Arg.getValueType() != MVT::f32) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
// Non-array float values are extended and passed in a GPR.
} else if (!Flags.isInConsecutiveRegs()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
// If we have an array of floats, we collect every odd element
// together with its predecessor into one GPR.
} else if (ArgOffset % PtrByteSize != 0) {
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
if (!isLittleEndian)
std::swap(Lo, Hi);
ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
// The final element, if even, goes into the first half of a GPR.
} else if (Flags.isInConsecutiveRegsLast()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
if (!isLittleEndian)
ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
// Non-final even elements are skipped; they will be handled
// together with the subsequent argument on the next go-around.
} else
ArgVal = SDValue();
if (ArgVal.getNode())
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
} else {
if (IsFastCall)
ComputePtrOff();
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
if (Arg.getValueType() == MVT::f32 &&
!isLittleEndian && !Flags.isInConsecutiveRegs()) {
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
NeededLoad = true;
}
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (!IsFastCall || NeededLoad) {
ArgOffset += (Arg.getValueType() == MVT::f32 &&
Flags.isInConsecutiveRegs()) ? 4 : 8;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
break;
}
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// vector aggregates.
// For a varargs call, named arguments go into VRs or on the stack as
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
}
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs Altivec params go into VRs or on the stack.
if (VR_idx != NumVRs) {
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else {
if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += 16;
}
if (!IsFastCall)
ArgOffset += 16;
break;
} // not QPX
assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (QFPR_idx != NumQFPRs) {
SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
}
ArgOffset += (IsF32 ? 16 : 32);
for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs QPX params go into registers or on the stack.
if (QFPR_idx != NumQFPRs) {
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
} else {
if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
}
if (!IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
break;
}
}
}
assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
"mismatch in size of parameter area");
(void)NumBytesActuallyUsed;
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Check if this is an indirect call (MTCTR/BCTRL).
// See prepareDescriptorIndirectCall and buildCallOperands for more
// information about calls through function pointers in the 64-bit SVR4 ABI.
if (CFlags.IsIndirect) {
// For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
// caller in the TOC save area.
if (isTOCSaveRestoreRequired(Subtarget)) {
assert(!CFlags.IsTailCall && "Indirect tail calls not supported");
// Load r2 into a virtual register and store it to the TOC save area.
setUsesTOCBasePtr(DAG);
SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
// TOC save area offset.
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
MachinePointerInfo::getStack(
DAG.getMachineFunction(), TOCSaveOffset));
}
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
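// (In the ELFv2 ABI the callee's global entry point recomputes its TOC
// pointer from r12, which is why the callee's address needs to be in r12
// at the point of the call.)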
if (isELFv2ABI && !CFlags.IsPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (CFlags.IsTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
SDValue PPCTargetLowering::LowerCall_Darwin(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a tail call. As a
// consequence, the frame pointer will be used for dynamic stack allocation
// and for restoring the caller's stack pointer in this function's epilogue.
// This is done because a tail-called function might overwrite the value in
// this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CFlags.CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
// Add up all the space actually used.
// In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
// they all go in registers, but we must reserve stack space for them for
// possible use by the caller. In varargs or 64-bit calls, parameters are
// assigned stack space in order, with padding so Altivec parameters are
// 16-byte aligned.
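// For example, in a 32-bit non-varargs call with two Altivec arguments, the
// accounting below rounds NumBytes up to a 16-byte boundary and then adds
// 2 * 16 = 32 bytes after the non-Altivec parameters.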
unsigned nAltivecParamsAtEnd = 0;
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
// Varargs Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
if (!CFlags.IsVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
nAltivecParamsAtEnd++;
continue;
}
// Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
NumBytes = ((NumBytes+15)/16)*16;
}
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
}
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
NumBytes = ((NumBytes+15)/16)*16;
NumBytes += 16*nAltivecParamsAtEnd;
}
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if the callee
// is variadic.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CFlags.CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere
// else later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use in loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr;
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
else
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR_32);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
// On PPC64, promote integers to 64-bit values.
if (isPPC64 && Arg.getValueType() == MVT::i32) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
}
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
unsigned Size = Flags.getByValSize();
// Very small objects are passed right-justified. Everything else is
// passed left-justified.
if (Size==1 || Size==2) {
EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
PtrOff.getValueType());
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
}
continue;
}
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
CallSeqStart,
Flags, DAG, dl);
// For small aggregates (Darwin only) and aggregates >= PtrByteSize,
// copy the pieces of the object that fit into registers from the
// parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
break;
}
}
continue;
}
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
}
ArgOffset += PtrByteSize;
break;
case MVT::f32:
case MVT::f64:
if (FPR_idx != NumFPRs) {
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
if (CFlags.IsVarArg) {
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
// Float varargs are always shadowed in available integer registers
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
} else {
// If we have any FPRs remaining, we may also have GPRs remaining.
// Args passed in FPRs consume either 1 (f32) or 2 (f64) available
// GPRs.
if (GPR_idx != NumGPRs)
++GPR_idx;
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
!isPPC64) // PPC64 has 64-bit GPRs, obviously :)
++GPR_idx;
}
} else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (isPPC64)
ArgOffset += 8;
else
ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
if (CFlags.IsVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
// V registers; in fact gcc does this only for arguments that are
// prototyped, not for those that match the "...". We do it for all
// arguments; it seems to work.
while (ArgOffset % 16 !=0) {
ArgOffset += PtrByteSize;
if (GPR_idx != NumGPRs)
GPR_idx++;
}
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
}
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs Altivec params generally go in registers, but have
// stack space allocated at the end.
if (VR_idx != NumVRs) {
// Doesn't have GPR space allocated.
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else if (nAltivecParamsAtEnd==0) {
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
break;
}
}
// If all Altivec parameters fit in registers, as they usually do,
// they get stack space following the non-Altivec parameters. We
// don't track this here because nobody below needs it.
// If there are more Altivec parameters than fit in registers, emit
// the stores here.
if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) {
unsigned j = 0;
// Offset is aligned; skip 1st 12 params which go in V registers.
ArgOffset = ((ArgOffset+15)/16)*16;
ArgOffset += 12*16;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
EVT ArgType = Outs[i].VT;
if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
if (++j > NumVRs) {
SDValue PtrOff;
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
if (CFlags.IsIndirect) {
assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (CFlags.IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
assert((!ValVT.isInteger() ||
(ValVT.getSizeInBits() <= RegVT.getSizeInBits())) &&
"Integer argument exceeds register size: should have been legalized");
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
if (ValVT.isVector() || LocVT.isVector())
report_fatal_error("Vector arguments are unimplemented on AIX.");
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
static const MCPhysReg GPR_64[] = {// 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
if (ArgFlags.isByVal()) {
if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
report_fatal_error("Pass-by-value arguments with alignment greater than "
"register width are not supported.");
const unsigned ByValSize = ArgFlags.getByValSize();
// An empty aggregate parameter takes up no storage and no registers,
// but needs a MemLoc for a stack slot for the formal arguments side.
if (ByValSize == 0) {
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
State.getNextStackOffset(), RegVT,
LocInfo));
return false;
}
const unsigned StackSize = alignTo(ByValSize, PtrAlign);
unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
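// For example (illustrative sizes), a 12-byte by-val argument on 64-bit AIX
// gives StackSize = alignTo(12, 8) = 16, so the loop below tries to claim
// two GPRs; if the register file runs out partway through, the remaining
// bytes get a single MemLoc at the current offset and the loop stops.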
for (const unsigned E = Offset + StackSize; Offset < E;
Offset += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
else {
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
LocInfo));
break;
}
}
return false;
}
// Arguments always reserve parameter save area.
switch (ValVT.SimpleTy) {
default:
report_fatal_error("Unhandled value type for argument.");
case MVT::i64:
// i64 arguments should have been split to i32 for PPC32.
assert(IsPPC64 && "PPC32 should have split i64 values.");
LLVM_FALLTHROUGH;
case MVT::i1:
case MVT::i32: {
const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
// AIX integer arguments are always passed in register width.
if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
: CCValAssign::LocInfo::ZExt;
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
else
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
return false;
}
case MVT::f32:
case MVT::f64: {
// Parameter save area (PSA) is reserved even if the float is passed in an FPR.
const unsigned StoreSize = LocVT.getStoreSize();
// Floats are always 4-byte aligned in the PSA on AIX.
// This includes f64 in 64-bit mode for ABI compatibility.
const unsigned Offset =
State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
unsigned FReg = State.AllocateReg(FPR);
if (FReg)
State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
// Reserve and initialize GPRs or initialize the PSA as required.
for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
assert(FReg && "An FPR should be available when a GPR is reserved.");
if (State.isVarArg()) {
// Successfully reserved GPRs are only initialized for vararg calls.
// Custom handling is required for:
// - f64 on PPC32, which needs to be split into 2 GPRs.
// - f32 on PPC64, which needs to occupy only the lower 32 bits of a
//   64-bit GPR.
State.addLoc(
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
} else {
// If there are insufficient GPRs, the PSA needs to be initialized.
// For compatibility with the AIX XL compiler, initialization occurs
// even if an FPR was already assigned. The full memory for the
// argument will be initialized even if a prior word is saved in a GPR.
// A custom MemLoc is used when the argument is also passed in an FPR so
// that the callee handling can skip over it easily.
State.addLoc(
FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
LocInfo)
: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
break;
}
}
return false;
}
}
return true;
}
static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
bool IsPPC64) {
assert((IsPPC64 || SVT != MVT::i64) &&
"i64 should have been split for 32-bit codegen.");
switch (SVT) {
default:
report_fatal_error("Unexpected value type for formal argument");
case MVT::i1:
case MVT::i32:
case MVT::i64:
return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
case MVT::f32:
return &PPC::F4RCRegClass;
case MVT::f64:
return &PPC::F8RCRegClass;
}
}
static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
SelectionDAG &DAG, SDValue ArgValue,
MVT LocVT, const SDLoc &dl) {
assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
assert(ValVT.getSizeInBits() < LocVT.getSizeInBits());
if (Flags.isSExt())
ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
DAG.getValueType(ValVT));
else if (Flags.isZExt())
ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
DAG.getValueType(ValVT));
return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}
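// Map a GPR argument register to the stack offset of the slot it shadows in
// the parameter save area, which sits just past the linkage area. For
// example, on 32-bit AIX the linkage area is 24 bytes, so R5 maps to
// 24 + 4 * (R5 - R3) = 32.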
static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
const unsigned LASize = FL->getLinkageSize();
if (PPC::GPRCRegClass.contains(Reg)) {
assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
"Reg must be a valid argument register!");
return LASize + 4 * (Reg - PPC::R3);
}
if (PPC::G8RCRegClass.contains(Reg)) {
assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
"Reg must be a valid argument register!");
return LASize + 8 * (Reg - PPC::X3);
}
llvm_unreachable("Only general purpose registers expected.");
}
// AIX ABI Stack Frame Layout:
//
// Low Memory +--------------------------------------------+
// SP +---> | Back chain | ---+
// | +--------------------------------------------+ |
// | | Saved Condition Register | |
// | +--------------------------------------------+ |
// | | Saved Linkage Register | |
// | +--------------------------------------------+ | Linkage Area
// | | Reserved for compilers | |
// | +--------------------------------------------+ |
// | | Reserved for binders | |
// | +--------------------------------------------+ |
// | | Saved TOC pointer | ---+
// | +--------------------------------------------+
// | | Parameter save area |
// | +--------------------------------------------+
// | | Alloca space |
// | +--------------------------------------------+
// | | Local variable space |
// | +--------------------------------------------+
// | | Float/int conversion temporary |
// | +--------------------------------------------+
// | | Save area for AltiVec registers |
// | +--------------------------------------------+
// | | AltiVec alignment padding |
// | +--------------------------------------------+
// | | Save area for VRSAVE register |
// | +--------------------------------------------+
// | | Save area for General Purpose registers |
// | +--------------------------------------------+
// | | Save area for Floating Point registers |
// | +--------------------------------------------+
// +---- | Back chain |
// High Memory +--------------------------------------------+
//
// Specifications:
// AIX 7.2 Assembler Language Reference
// Subroutine linkage convention
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
if (getTargetMachine().Options.GuaranteedTailCallOpt)
report_fatal_error("Tail call support is unimplemented on AIX.");
if (useSoftFloat())
report_fatal_error("Soft float support is unimplemented on AIX.");
const PPCSubtarget &Subtarget =
static_cast<const PPCSubtarget &>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX support is not supported on AIX.");
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
SmallVector<SDValue, 8> MemOps;
for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
CCValAssign &VA = ArgLocs[I++];
MVT LocVT = VA.getLocVT();
ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
// For compatibility with the AIX XL compiler, the float args in the
// parameter save area are initialized even if the argument is available
// in a register. The caller is required to initialize both the register
// and memory; however, the callee can choose to expect it in either.
// The MemLoc is dismissed here because the argument is retrieved from
// the register.
if (VA.isMemLoc() && VA.needsCustom())
continue;
if (Flags.isByVal() && VA.isMemLoc()) {
const unsigned Size =
alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
PtrByteSize);
const int FI = MF.getFrameInfo().CreateFixedObject(
Size, VA.getLocMemOffset(), /* IsImmutable */ false,
/* IsAliased */ true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
continue;
}
if (Flags.isByVal()) {
assert(VA.isRegLoc() && "MemLocs should already be handled.");
const MCPhysReg ArgReg = VA.getLocReg();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
if (Flags.getNonZeroByValAlign() > PtrByteSize)
report_fatal_error("Over aligned byvals not supported yet.");
const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
const int FI = MF.getFrameInfo().CreateFixedObject(
StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
/* IsAliased */ true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
// Add live ins for all the RegLocs for the same ByVal.
const TargetRegisterClass *RegClass =
IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
unsigned Offset) {
const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
// Since the caller's side has left-justified the aggregate in the
// register, we can simply store the entire register into the stack
// slot.
SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
// The store to the fixed stack object is needed because accessing a
// field of the ByVal will use a GEP and load. Ideally we will optimize
// to extracting the value from the register directly, and elide the
// stores when the argument's address is not taken, but that will need to
// be future work.
SDValue Store =
DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
DAG.getObjectPtrOffset(dl, FIN, Offset),
MachinePointerInfo::getFixedStack(MF, FI, Offset));
MemOps.push_back(Store);
};
unsigned Offset = 0;
HandleRegLoc(VA.getLocReg(), Offset);
Offset += PtrByteSize;
for (; Offset != StackSize && ArgLocs[I].isRegLoc();
Offset += PtrByteSize) {
assert(ArgLocs[I].getValNo() == VA.getValNo() &&
"RegLocs should be for ByVal argument.");
const CCValAssign RL = ArgLocs[I++];
HandleRegLoc(RL.getLocReg(), Offset);
}
if (Offset != StackSize) {
assert(ArgLocs[I].getValNo() == VA.getValNo() &&
"Expected MemLoc for remaining bytes.");
assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
// Consume the MemLoc. The InVal has already been emitted, so nothing
// more needs to be done.
++I;
}
continue;
}
EVT ValVT = VA.getValVT();
if (VA.isRegLoc() && !VA.needsCustom()) {
MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
unsigned VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
(ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
}
InVals.push_back(ArgValue);
continue;
}
if (VA.isMemLoc()) {
const unsigned LocSize = LocVT.getStoreSize();
const unsigned ValSize = ValVT.getStoreSize();
assert((ValSize <= LocSize) &&
"Object size is larger than size of MemLoc");
int CurArgOffset = VA.getLocMemOffset();
// Objects are right-justified because AIX is big-endian.
if (LocSize > ValSize)
CurArgOffset += LocSize - ValSize;
// Potential tail calls could cause overwriting of argument stack slots.
const bool IsImmutable =
!(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue ArgValue =
DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
InVals.push_back(ArgValue);
continue;
}
}
// On AIX a minimum of 8 words is saved to the parameter save area.
const unsigned MinParameterSaveArea = 8 * PtrByteSize;
// Area that is at least reserved in the caller of this function.
unsigned CallerReservedArea =
std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so
// that taking the difference between two stack areas will result in an
// aligned stack.
CallerReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setMinReservedArea(CallerReservedArea);
if (isVarArg) {
FuncInfo->setVarArgsFrameIndex(
MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32);
// The fixed integer arguments of a variadic function are stored to the
// VarArgsFrameIndex on the stack so that they may be loaded by
// dereferencing the result of va_next.
for (unsigned GPRIndex =
(CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize;
GPRIndex < NumGPArgRegs; ++GPRIndex) {
const unsigned VReg =
IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
: MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address for the next argument to store.
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
SDValue PPCTargetLowering::LowerCall_AIX(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
// AIX ABI stack frame layout.
assert((CFlags.CallConv == CallingConv::C ||
CFlags.CallConv == CallingConv::Cold ||
CFlags.CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
if (CFlags.IsPatchPoint)
report_fatal_error("This call type is unimplemented on AIX.");
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX is not supported on AIX.");
if (Subtarget.hasAltivec())
report_fatal_error("Altivec support is unimplemented on AIX.");
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
*DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
// [SP][CR][LR][2 x reserved][TOC].
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
const bool IsPPC64 = Subtarget.isPPC64();
const EVT PtrVT = getPointerTy(DAG.getDataLayout());
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if the callee
// is variadic.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
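// (That is 8 * 8 = 64 bytes on 64-bit AIX, or 8 * 4 = 32 bytes on 32-bit
// AIX, on top of the linkage area.)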
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize,
CCInfo.getNextStackOffset());
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass.
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Set up a copy of the stack pointer for loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
: DAG.getRegister(PPC::R1, MVT::i32);
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
const unsigned ValNo = ArgLocs[I].getValNo();
SDValue Arg = OutVals[ValNo];
ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
if (Flags.isByVal()) {
const unsigned ByValSize = Flags.getByValSize();
// Nothing to do for zero-sized ByVals on the caller side.
if (!ByValSize) {
++I;
continue;
}
auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
(LoadOffset != 0)
? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
: Arg,
MachinePointerInfo(), VT);
};
unsigned LoadOffset = 0;
// Initialize the registers that are fully occupied by the by-val argument.
while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
SDValue Load = GetLoad(PtrVT, LoadOffset);
MemOpChains.push_back(Load.getValue(1));
LoadOffset += PtrByteSize;
const CCValAssign &ByValVA = ArgLocs[I++];
assert(ByValVA.getValNo() == ValNo &&
"Unexpected location for pass-by-value argument.");
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
}
if (LoadOffset == ByValSize)
continue;
// There must be one more loc to handle the remainder.
assert(ArgLocs[I].getValNo() == ValNo &&
"Expected additional location for by-value argument.");
if (ArgLocs[I].isMemLoc()) {
assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
const CCValAssign &ByValVA = ArgLocs[I++];
ISD::ArgFlagsTy MemcpyFlags = Flags;
// Only memcpy the bytes that are not passed in registers.
MemcpyFlags.setByValSize(ByValSize - LoadOffset);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(
(LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
: Arg,
DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
CallSeqStart, MemcpyFlags, DAG, dl);
continue;
}
// Initialize the final register residue.
// Any residue that occupies the final by-val arg register must be
// left-justified on AIX. Loads must be a power-of-2 size and cannot be
// larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
// 2 and 1 byte loads.
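// For example, on PPC64 the 7-byte residue is assembled from a 4-byte, a
// 2-byte and a 1-byte zero-extended load, shifted left by 32, 16 and 8 bits
// respectively and OR'ed together so the value ends up left-justified.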
const unsigned ResidueBytes = ByValSize % PtrByteSize;
assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
"Unexpected register residue for by-value argument.");
SDValue ResidueVal;
for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
const unsigned N = PowerOf2Floor(ResidueBytes - Bytes);
const MVT VT =
N == 1 ? MVT::i8
: ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
SDValue Load = GetLoad(VT, LoadOffset);
MemOpChains.push_back(Load.getValue(1));
LoadOffset += N;
Bytes += N;
// By-val arguments are passed left-justified in the register.
// Every load here needs to be shifted; otherwise a full register load
// should have been used.
assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
"Unexpected load emitted during handling of pass-by-value "
"argument.");
unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
EVT ShiftAmountTy =
getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
SDValue ShiftedLoad =
DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
ShiftedLoad)
: ShiftedLoad;
}
const CCValAssign &ByValVA = ArgLocs[I++];
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
continue;
}
CCValAssign &VA = ArgLocs[I++];
const MVT LocVT = VA.getLocVT();
const MVT ValVT = VA.getValVT();
switch (VA.getLocInfo()) {
default:
report_fatal_error("Unexpected argument extension type.");
case CCValAssign::Full:
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc() && !VA.needsCustom()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
continue;
}
if (VA.isMemLoc()) {
SDValue PtrOff =
DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
continue;
}
// Custom handling is used for GPR initializations for vararg float
// arguments.
assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
ValVT.isFloatingPoint() && LocVT.isInteger() &&
"Unexpected register handling for calling convention.");
SDValue ArgAsInt =
DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits())
// f32 in 64-bit GPR.
RegsToPass.push_back(std::make_pair(
VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
else {
// f64 in two 32-bit GPRs
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
"Unexpected custom register for argument!");
CCValAssign &GPR1 = VA;
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
DAG.getConstant(32, dl, MVT::i8));
RegsToPass.push_back(std::make_pair(
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
if (I != E) {
// If only 1 GPR was available, there will only be one custom GPR and
// the argument will also pass in memory.
CCValAssign &PeekArg = ArgLocs[I];
if (PeekArg.isRegLoc() && PeekArg.getValNo() == ValNo) {
assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
CCValAssign &GPR2 = ArgLocs[I++];
RegsToPass.push_back(std::make_pair(
GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
}
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// For indirect calls, we need to save the TOC base to the stack for
// restoration after the call.
if (CFlags.IsIndirect) {
assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned TOCSaveOffset =
Subtarget.getFrameLowering()->getTOCSaveOffset();
setUsesTOCBasePtr(DAG);
SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(
Val.getValue(1), dl, Val, AddPtr,
MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
InFlag = Chain.getValue(1);
}
const int SPDiff = 0;
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(
Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
}
SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeReturn(Outs,
(Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[RealResIdx];
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
bool isLittleEndian = Subtarget.isLittleEndian();
// Legalize ret f64 -> ret 2 x i32.
SDValue SVal =
DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
Flag = Chain.getValue(1);
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}
SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Get the correct type for integers.
EVT IntVT = Op.getValueType();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
// Build a DYNAREAOFFSET node.
SDValue Ops[2] = {Chain, FPSIdx};
SDVTList VTs = DAG.getVTList(IntVT);
return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
SelectionDAG &DAG) const {
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
bool isPPC64 = Subtarget.isPPC64();
unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
SDValue StackPtr = DAG.getRegister(SP, PtrVT);
// Get the operands for the STACKRESTORE.
SDValue Chain = Op.getOperand(0);
SDValue SaveSP = Op.getOperand(1);
// Load the old link SP.
SDValue LoadLinkSP =
DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
// Restore the stack pointer.
Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
// Store the old link SP.
return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get the current return address save index.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int RASI = FI->getReturnAddrSaveIndex();
// If the return address save index hasn't been defined yet.
if (!RASI) {
// Find out the fixed offset of the return address save area.
int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
// Allocate the frame index for the return address save area.
RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
// Save the result.
FI->setReturnAddrSaveIndex(RASI);
}
return DAG.getFrameIndex(RASI, PtrVT);
}
SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int FPSI = FI->getFramePointerSaveIndex();
// If the frame pointer save index hasn't been defined yet.
if (!FPSI) {
// Find out the fixed offset of the frame pointer save area.
int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
// Allocate the frame index for frame pointer save area.
FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
// Save the result.
FI->setFramePointerSaveIndex(FPSI);
}
return DAG.getFrameIndex(FPSI, PtrVT);
}
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, dl, PtrVT), Size);
// Construct a node for the frame pointer save index.
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
SDValue Ops[3] = { Chain, NegSize, FPSIdx };
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
if (hasInlineStackProbe(MF))
return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);
return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
return DAG.getFrameIndex(FI, PtrVT);
}
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorLoad(Op, DAG);
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 loads");
// First, load 8 bits into 32 bits, then truncate to 1 bit.
SDLoc dl(Op);
LoadSDNode *LD = cast<LoadSDNode>(Op);
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand *MMO = LD->getMemOperand();
SDValue NewLD =
DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
BasePtr, MVT::i8, MMO);
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
return DAG.getMergeValues(Ops, dl);
}
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (Op.getOperand(1).getValueType().isVector())
return LowerVectorStore(Op, DAG);
assert(Op.getOperand(1).getValueType() == MVT::i1 &&
"Custom lowering only for i1 stores");
// First, zero extend to 32 bits, then use a truncating store to 8 bits.
SDLoc dl(Op);
StoreSDNode *ST = cast<StoreSDNode>(Op);
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
SDValue Value = ST->getValue();
MachineMemOperand *MMO = ST->getMemOperand();
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
Value);
return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
}
// FIXME: Remove this once the ANDI glue bug is fixed:
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 results");
SDLoc DL(Op);
return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
}
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
SelectionDAG &DAG) const {
// Implements a vector truncate that fits in a vector register as a shuffle.
// We want to legalize vector truncates down to where the source fits in
// a vector register (and target is therefore smaller than vector register
// size). At that point legalization will try to custom lower the sub-legal
// result and get here - where we can contain the truncate as a single target
// operation.
// For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
// <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
//
// We will implement it for big-endian ordering as this (where x denotes
// undefined):
// < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
// < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
//
// The same operation in little-endian ordering will be:
// <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
// <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
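// For the <2 x i16> -> <2 x i8> example, the byte shuffle mask built below is
// {1, 3, ...} for big-endian and {0, 2, ...} for little-endian, with the
// remaining lanes taken from the UNDEF operand.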
assert(Op.getValueType().isVector() && "Vector type expected.");
SDLoc DL(Op);
SDValue N1 = Op.getOperand(0);
unsigned SrcSize = N1.getValueType().getSizeInBits();
assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
EVT TrgVT = Op.getValueType();
unsigned TrgNumElts = TrgVT.getVectorNumElements();
EVT EltVT = TrgVT.getVectorElementType();
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
// First list the elements we want to keep.
unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
SmallVector<int, 16> ShuffV;
if (Subtarget.isLittleEndian())
for (unsigned i = 0; i < TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult);
else
for (unsigned i = 1; i <= TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult - 1);
// Populate the remaining elements with undefs: any mask index >= WideNumElts
// selects a lane from the UNDEF operand of the shuffle below.
for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
ShuffV.push_back(WideNumElts + 1);
SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
}
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Not FP? Not a fsel.
if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
!Op.getOperand(2).getValueType().isFloatingPoint())
return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
EVT ResVT = Op.getValueType();
EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
SDNodeFlags Flags = Op.getNode()->getFlags();
// We have xsmaxcdp/xsmincdp which are OK to emit even in the
// presence of infinities.
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
switch (CC) {
default:
break;
case ISD::SETOGT:
case ISD::SETGT:
return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
case ISD::SETOLT:
case ISD::SETLT:
return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS);
}
}
// We might be able to do better than this under some circumstances, but in
// general, fsel-based lowering of select is a finite-math-only optimization.
// For more information, see section F.3 of the 2.06 ISA specification.
// (With ISA 3.0, the xsmaxcdp/xsmincdp path above does not have this
// restriction.)
if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
(!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
return Op;
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
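// (Recall that fsel selects its second operand when the first operand
// compares greater than or equal to zero, and its third operand otherwise,
// including for NaN inputs; e.g. select_cc(LHS, RHS, TV, FV, SETGE) becomes
// fsel(LHS - RHS, TV, FV) in the general case below.)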
SDValue Sel1;
if (isFloatingPointZero(RHS))
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
LLVM_FALLTHROUGH;
case ISD::SETEQ:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
LLVM_FALLTHROUGH;
case ISD::SETOGE:
case ISD::SETGE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
LLVM_FALLTHROUGH;
case ISD::SETOLE:
case ISD::SETLE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
}
SDValue Cmp;
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
LLVM_FALLTHROUGH;
case ISD::SETEQ:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOGE:
case ISD::SETGE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOLE:
case ISD::SETLE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
}
return Op;
}
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
? PPCISD::FCTIWZ
: (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
dl, MVT::f64, Src);
break;
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ,
dl, MVT::f64, Src);
break;
}
// Convert the FP value to an int value through memory.
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
(Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Emit a store to the stack slot.
SDValue Chain;
Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
if (i32Stack) {
MachineFunction &MF = DAG.getMachineFunction();
Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
// add in a bias on big endian.
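// (The 32-bit result occupies the low-order word of the stored f64, which
// lives at byte offset 4 of the 8-byte slot on big-endian targets.)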
if (Op.getValueType() == MVT::i32 && !i32Stack) {
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
DAG.getConstant(4, dl, FIPtr.getValueType()));
MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
}
RLI.Chain = Chain;
RLI.Ptr = FIPtr;
RLI.MPI = MPI;
RLI.Alignment = Alignment;
}
/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
? PPCISD::FCTIWZ
: (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
break;
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ,
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
break;
}
return Tmp;
}
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
// FP to INT conversions are legal for f128.
if (Op->getOperand(0).getValueType() == MVT::f128)
return Op;
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
if (Op.getValueType() == MVT::i32) {
if (Op.getOpcode() == ISD::FP_TO_SINT) {
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(1, dl));
// Add the two halves of the long double in round-to-zero mode.
SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
// Now use a smaller FP_TO_SINT.
return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
}
if (Op.getOpcode() == ISD::FP_TO_UINT) {
const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
// X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
// FIXME: generated code sucks.
// TODO: Are there fast-math-flags to propagate to this FSUB?
SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
Op.getOperand(0), Tmp);
True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
DAG.getConstant(0x80000000, dl, MVT::i32));
SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
Op.getOperand(0));
return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
ISD::SETGE);
}
}
return SDValue();
}
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
ReuseLoadInfo RLI;
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
}
// We're trying to insert a regular store, S, and then a load, L. If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O. However, we don't know if anything else will store to
// that address before we can load from it. To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O. To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (see spliceIntoChain below for this last part).
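// Schematically:
//   before:  O(chain result) --> former chain users
//   after:   O(chain result) --+
//            L(chain result) --+--> TokenFactor --> former chain users
// with L taking the same chain operand as O.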
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET) const {
SDLoc dl(Op);
bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
if (ET == ISD::NON_EXTLOAD &&
(ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
isOperationLegalOrCustom(Op.getOpcode(),
Op.getOperand(0).getValueType())) {
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return true;
}
LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
LD->isNonTemporal())
return false;
if (LD->getMemoryVT() != MemVT)
return false;
RLI.Ptr = LD->getBasePtr();
if (LD->isIndexed() && !LD->getOffset().isUndef()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
LD->getOffset());
}
RLI.Chain = LD->getChain();
RLI.MPI = LD->getPointerInfo();
RLI.IsDereferenceable = LD->isDereferenceable();
RLI.IsInvariant = LD->isInvariant();
RLI.Alignment = LD->getAlign();
RLI.AAInfo = LD->getAAInfo();
RLI.Ranges = LD->getRanges();
RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
return true;
}
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
SDValue NewResChain,
SelectionDAG &DAG) const {
if (!ResChain)
return;
SDLoc dl(NewResChain);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
NewResChain, DAG.getUNDEF(MVT::Other));
assert(TF.getNode() != NewResChain.getNode() &&
"A new TF really is required here");
DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}
/// Analyze the profitability of a direct move: prefer a float load over an
/// int load plus a direct move when the loaded integer value has no integer
/// uses.
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
SDNode *Origin = Op.getOperand(0).getNode();
if (Origin->getOpcode() != ISD::LOAD)
return true;
// On subtargets without LXSIBZX/LXSIHZX (e.g. Power8), prefer a direct move
// if the memory size is 1 or 2 bytes.
MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
return true;
for (SDNode::use_iterator UI = Origin->use_begin(),
UE = Origin->use_end();
UI != UE; ++UI) {
// Only look at the users of the loaded value.
if (UI.getUse().get().getResNo() != 0)
continue;
if (UI->getOpcode() != ISD::SINT_TO_FP &&
UI->getOpcode() != ISD::UINT_TO_FP)
return true;
}
return false;
}
/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert((Op.getValueType() == MVT::f32 ||
Op.getValueType() == MVT::f64) &&
"Invalid floating point type as target of conversion");
assert(Subtarget.hasFPCVT() &&
"Int to FP conversions with direct moves require FPCVT");
SDValue FP;
SDValue Src = Op.getOperand(0);
bool SinglePrec = Op.getValueType() == MVT::f32;
bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
(SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);
if (WordInt) {
FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ,
dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
}
else {
FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
}
return FP;
}
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
EVT VecVT = Vec.getValueType();
assert(VecVT.isVector() && "Expected a vector type.");
assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
EVT EltVT = VecVT.getVectorElementType();
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(NumConcat);
Ops[0] = Vec;
SDValue UndefVec = DAG.getUNDEF(VecVT);
for (unsigned i = 1; i < NumConcat; ++i)
Ops[i] = UndefVec;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
}
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
unsigned Opc = Op.getOpcode();
assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
"Unexpected conversion type");
assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
"Supports conversions to v2f64/v4f32 only.");
bool SignedConv = Opc == ISD::SINT_TO_FP;
bool FourEltRes = Op.getValueType() == MVT::v4f32;
SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
EVT WideVT = Wide.getValueType();
unsigned WideNumElts = WideVT.getVectorNumElements();
MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
SmallVector<int, 16> ShuffV;
for (unsigned i = 0; i < WideNumElts; ++i)
ShuffV.push_back(i + WideNumElts);
int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
int SaveElts = FourEltRes ? 4 : 2;
if (Subtarget.isLittleEndian())
for (int i = 0; i < SaveElts; i++)
ShuffV[i * Stride] = i;
else
for (int i = 1; i <= SaveElts; i++)
ShuffV[i * Stride - 1] = i - 1;
SDValue ShuffleSrc2 =
SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
SDValue Extend;
if (SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
EVT ExtVT = Op.getOperand(0).getValueType();
if (Subtarget.hasP9Altivec())
ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
IntermediateVT.getVectorNumElements());
Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
DAG.getValueType(ExtVT));
} else
Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT InVT = Op.getOperand(0).getValueType();
EVT OutVT = Op.getValueType();
if (OutVT.isVector() && OutVT.isFloatingPoint() &&
isOperationCustom(Op.getOpcode(), InVT))
return LowerINT_TO_FPVector(Op, DAG, dl);
// Conversions to f128 are legal.
if (Op.getValueType() == MVT::f128)
return Op;
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
return SDValue();
SDValue Value = Op.getOperand(0);
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
if (Op.getValueType() != MVT::v4f64)
Value = DAG.getNode(ISD::FP_ROUND, dl,
Op.getValueType(), Value,
DAG.getIntPtrConstant(1, dl));
return Value;
}
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
if (Op.getOperand(0).getValueType() == MVT::i1)
return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
DAG.getConstantFP(1.0, dl, Op.getValueType()),
DAG.getConstantFP(0.0, dl, Op.getValueType()));
// If we have direct moves, we can do the whole conversion and skip the
// store/load; however, without FPCVT we can't do most conversions.
if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
Subtarget.isPPC64() && Subtarget.hasFPCVT())
return LowerINT_TO_FPDirectMove(Op, DAG, dl);
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: PPCISD::FCFIDS)
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
: PPCISD::FCFID);
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
if (Op.getOperand(0).getValueType() == MVT::i64) {
SDValue SINT = Op.getOperand(0);
// When converting to single-precision, we actually need to convert
// to double-precision first and then round to single-precision.
// To avoid double-rounding effects during that operation, we have
// to prepare the input operand. Bits that might be truncated when
// converting to double-precision are replaced by a bit that won't
// be lost at this stage, but is below the single-precision rounding
// position.
//
// However, if -enable-unsafe-fp-math is in effect, accept double
// rounding to avoid the extra overhead.
if (Op.getValueType() == MVT::f32 &&
!Subtarget.hasFPCVT() &&
!DAG.getTarget().Options.UnsafeFPMath) {
// Twiddle input to make sure the low 11 bits are zero. (If this
// is the case, we are guaranteed the value will fit into the 53 bit
// mantissa of an IEEE double-precision value without rounding.)
// If any of those low 11 bits were not zero originally, make sure
// bit 12 (value 2048) is set instead, so that the final rounding
// to single-precision gets the correct result.
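// (Adding 2047 to the low 11 bits carries into the bit with value 2048
// exactly when at least one of those bits is set, which is what the OR/AND
// sequence below relies on.)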
SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
SINT, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
Round, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
Round = DAG.getNode(ISD::AND, dl, MVT::i64,
Round, DAG.getConstant(-2048, dl, MVT::i64));
// However, we cannot use that value unconditionally: if the magnitude
// of the input value is small, the bit-twiddling we did above might
// end up visibly changing the output. Fortunately, in that case, we
// don't need to twiddle bits since the original input will convert
// exactly to double-precision floating-point already. Therefore,
// construct a conditional to use the original value if the top 11
// bits are all sign-bit copies, and use the rounded value computed
// above otherwise.
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
SINT, DAG.getConstant(53, dl, MVT::i32));
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
Cond, DAG.getConstant(1, dl, MVT::i64));
Cond = DAG.getSetCC(
dl,
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
}
ReuseLoadInfo RLI;
SDValue Bits;
MachineFunction &MF = DAG.getMachineFunction();
if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasLFIWAX() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasFPCVT() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (((Subtarget.hasLFIWAX() &&
SINT.getOpcode() == ISD::SIGN_EXTEND) ||
(Subtarget.hasFPCVT() &&
SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
SINT.getOperand(0).getValueType() == MVT::i32) {
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
} else
Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
return FP;
}
assert(Op.getOperand(0).getValueType() == MVT::i32 &&
"Unhandled INT_TO_FP type in custom expander!");
// Since we only generate this in 64-bit mode, we can take advantage of
// 64-bit registers. In particular, sign extend the input value into the
// 64-bit register with extsw, store the WHOLE 64-bit value into the stack,
// and then lfd it and fcfid it.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
ReuseLoadInfo RLI;
bool ReusingLoad;
if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
DAG))) {
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
}
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
if (ReusingLoad)
spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
} else {
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
Op.getOperand(0));
// STD the extended value into the stack slot.
SDValue Store = DAG.getStore(
DAG.getEntryNode(), dl, Ext64, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
// Load the value as a double.
Ld = DAG.getLoad(
MVT::f64, dl, Store, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
}
// FCFID it and return it.
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
DAG.getIntPtrConstant(0, dl));
return FP;
}
SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
/*
The rounding mode is in bits 30:31 of FPSCR, and has the following
settings:
00 Round to nearest
01 Round to 0
10 Round to +inf
11 Round to -inf
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we do:
((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
*/
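// Concretely, the expression maps 00 -> 1, 01 -> 0, 10 -> 2 and 11 -> 3.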
MachineFunction &MF = DAG.getMachineFunction();
EVT VT = Op.getValueType();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
SDValue Chain = Op.getOperand(0);
SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
Chain = MFFS.getValue(1);
// Save FP register to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());
// Load FP Control Word from low 32 bits of stack slot.
SDValue Four = DAG.getConstant(4, dl, PtrVT);
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
Chain = CWD.getValue(1);
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::AND, dl, MVT::i32,
CWD, DAG.getConstant(3, dl, MVT::i32));
SDValue CWD2 =
DAG.getNode(ISD::SRL, dl, MVT::i32,
DAG.getNode(ISD::AND, dl, MVT::i32,
DAG.getNode(ISD::XOR, dl, MVT::i32,
CWD, DAG.getConstant(3, dl, MVT::i32)),
DAG.getConstant(3, dl, MVT::i32)),
DAG.getConstant(1, dl, MVT::i32));
SDValue RetVal =
DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
RetVal =
DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
dl, VT, RetVal);
return DAG.getMergeValues({RetVal, Chain}, dl);
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
SDLoc dl(Op);
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SHL!");
// Expand into a bunch of logical ops. Note that these ops
// depend on the PPC behavior for oversized shift amounts.
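// Conceptually OutHi = (Hi << Amt) | (Lo >> (BitWidth - Amt))
//                    | (Lo << (Amt - BitWidth));
// since PPC shifts produce zero once the (modulo 2*BitWidth) shift amount
// reaches BitWidth, exactly the terms that apply for a given Amt survive.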
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SRL!");
// Expand into a bunch of logical ops. Note that these ops
// depend on the PPC behavior for oversized shift amounts.
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SRA!");
// Expand into a bunch of logical ops, followed by a select_cc.
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
Tmp4, Tmp6, ISD::SETLE);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
//===----------------------------------------------------------------------===//
// Vector related lowering.
//
/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
/// element size of SplatSize. Cast the result to VT.
static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
SelectionDAG &DAG, const SDLoc &dl) {
static const MVT VTys[] = { // canonical VT to use for each size.
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
};
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
// For a splat with all ones, turn it into a vspltisb of 0xFF to canonicalize.
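// (E.g. a 16-bit splat of 0xFFFF is emitted as a byte splat of 0xFF.)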
if (Val == ((1LU << (SplatSize * 8)) - 1)) {
SplatSize = 1;
Val = 0xFF;
}
EVT CanonicalVT = VTys[SplatSize-1];
// Build a canonical splat for this value.
return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
const SDLoc &dl, EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op);
}
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const SDLoc &dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
}
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
}
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
SelectionDAG &DAG, const SDLoc &dl) {
// Force LHS/RHS to be the right type.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
int Ops[16];
for (unsigned i = 0; i != 16; ++i)
Ops[i] = i + Amt;
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
/// Do we have an efficient pattern in a .td file for this node?
///
/// \param V - pointer to the BuildVectorSDNode being matched
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
///
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
/// the opposite is true (expansion is beneficial) are:
/// - The node builds a vector out of integers that are not 32 or 64-bits
/// - The node builds a vector out of constants
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
bool HasDirectMove,
bool HasP8Vector) {
EVT VecVT = V->getValueType(0);
bool RightType = VecVT == MVT::v2f64 ||
(HasP8Vector && VecVT == MVT::v4f32) ||
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
if (!RightType)
return false;
bool IsSplat = true;
bool IsLoad = false;
SDValue Op0 = V->getOperand(0);
// This function is called in a block that confirms the node is not a constant
// splat. So a constant BUILD_VECTOR here means the vector is built out of
// different constants.
if (V->isConstant())
return false;
for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
if (V->getOperand(i).isUndef())
return false;
// We want to expand nodes that represent load-and-splat even if the
// loaded value is a floating point truncation or conversion to int.
if (V->getOperand(i).getOpcode() == ISD::LOAD ||
(V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
(V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
(V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
IsLoad = true;
// If the operands are different or the input is not a load and has more
// uses than just this BV node, then it isn't a splat.
if (V->getOperand(i) != Op0 ||
(!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
IsSplat = false;
}
return !(IsSplat && IsLoad);
}
// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Op0 = Op->getOperand(0);
if ((Op.getValueType() != MVT::f128) ||
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
(Op0.getOperand(0).getValueType() != MVT::i64) ||
(Op0.getOperand(1).getValueType() != MVT::i64))
return SDValue();
return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
Op0.getOperand(1));
}
-static const SDValue *getNormalLoadInput(const SDValue &Op) {
+static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
const SDValue *InputLoad = &Op;
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
- InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
+ InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
+ IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
InputLoad = &InputLoad->getOperand(0);
+ }
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
}
// Convert the argument APFloat to a single-precision APFloat if the
// conversion loses no information and the resulting number is not a denormal.
// Return true if successful.
bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
APFloat APFloatToConvert = ArgAPFloat;
bool LosesInfo = true;
APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
&LosesInfo);
bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
if (Success)
ArgAPFloat = APFloatToConvert;
return Success;
}
// Bitcast the argument APInt to a double, convert it to a single-precision
// APFloat, and bitcast that back into the original APInt argument, provided
// the conversion from double to single precision loses no information and the
// resulting number is not a denormal. Return true if successful.
bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
double DpValue = ArgAPInt.bitsToDouble();
APFloat APFloatDp(DpValue);
bool Success = convertToNonDenormSingle(APFloatDp);
if (Success)
ArgAPInt = APFloatDp.bitcastToAPInt();
return Success;
}
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
// We first build an i32 vector, load it into a QPX register,
// then convert it to a floating-point vector and compare it
// to a zero vector to get the boolean result.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
assert(BVN->getNumOperands() == 4 &&
"BUILD_VECTOR for v4i1 does not have 4 operands");
bool IsConst = true;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
IsConst = false;
break;
}
}
if (IsConst) {
Constant *One =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
Constant *NegOne =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
Constant *CV[4];
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef())
CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
else if (isNullConstant(BVN->getOperand(i)))
CV[i] = NegOne;
else
CV[i] = One;
}
Constant *CP = ConstantVector::get(CV);
SDValue CPIdx =
DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
return DAG.getMemIntrinsicNode(
PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
if (StoreSize > 4) {
Stores.push_back(
DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
PtrInfo.getWithOffset(Offset), MVT::i32));
} else {
SDValue StoreValue = BVN->getOperand(i);
if (StoreSize < 4)
StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
PtrInfo.getWithOffset(Offset)));
}
}
SDValue StoreChain;
if (!Stores.empty())
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
else
StoreChain = DAG.getEntryNode();
// Now load from v4i32 into the QPX register; this will extend it to
// v4i64 but not yet convert it to a floating point. Nevertheless, this
// is typed as v4f64 because the QPX register integer states are not
// explicitly represented.
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
FIdx};
SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
LoadedVect);
SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
}
// All other QPX vectors are handled by generic code.
if (Subtarget.hasQPX())
return SDValue();
// Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
bool BVNIsConstantSplat =
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
// If it is a splat of a double, check if we can shrink it to a 32 bit
// non-denormal float which when converted back to double gives us the same
// double. This is to exploit the XXSPLTIDP instruction.
if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
(SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
convertToNonDenormSingle(APSplatBits)) {
SDValue SplatNode = DAG.getNode(
PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
return DAG.getBitcast(Op.getValueType(), SplatNode);
}
if (!BVNIsConstantSplat || SplatBitSize > 32) {
- const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+ bool IsPermutedLoad = false;
+ const SDValue *InputLoad =
+ getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
if (InputLoad && DAG.isSplatValue(Op, true)) {
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// We have handling for 4 and 8 byte elements.
unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
// BUILD_VECTOR is a separate use of the value).
if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
((Subtarget.hasVSX() && ElementSize == 64) ||
(Subtarget.hasP9Vector() && ElementSize == 32))) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
return
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
DAG.getVTList(Op.getValueType(), MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
}
}
// BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
if (Subtarget.hasVSX() &&
haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
Subtarget.hasP8Vector()))
return Op;
return SDValue();
}
uint64_t SplatBits = APSplatBits.getZExtValue();
uint64_t SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
// First, handle single instruction cases.
// All zeros?
if (SplatBits == 0) {
// Canonicalize all zero vectors to be v4i32.
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
}
return Op;
}
// We have XXSPLTIW for constant splats four bytes wide.
// Since the vector length is a multiple of 4 bytes, 2-byte splats can be
// replaced with 4-byte splats: we replicate the SplatBits to turn a 2-byte
// splat into a 4-byte splat element. For example, a 2-byte splat of 0xABAB
// becomes a 4-byte splat of 0xABABABAB.
if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
Op.getValueType(), DAG, dl);
if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
dl);
// We have XXSPLTIB for constant splats one byte wide.
if (Subtarget.hasP9Vector() && SplatSize == 1)
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
dl);
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal = (int32_t(SplatBits << (32 - SplatBitSize)) >>
                   (32 - SplatBitSize));
if (SextVal >= -16 && SextVal <= 15)
return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
dl);
// Two instruction sequences.
// If this value is in the range [-32,30] and is even, use:
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
// If this value is in the range [17,31] and is odd, use:
// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
// If this value is in the range [-31,-17] and is odd, use:
// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
// Note the last two are three-instruction sequences.
if (SextVal >= -32 && SextVal <= 31) {
// To avoid having these optimizations undone by constant folding,
// we convert to a pseudo that will be expanded later into one of
// the above forms.
SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
EVT VT = (SplatSize == 1 ? MVT::v16i8 :
(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
if (VT == Op.getValueType())
return RetVal;
else
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
}
// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
// for fneg/fabs.
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
// Make -1 and vspltisw -1:
SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
// Make the VSLW intrinsic, computing 0x8000_0000.
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
OnesV, DAG, dl);
// xor by OnesV to invert it.
Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// Check to see if this is a wide variety of vsplti*, binop self cases.
static const signed char SplatCsts[] = {
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
};
for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
// Indirect through the SplatCsts array so that we favor 'vsplti -1' for
// cases which are ambiguous (e.g. formation of 0x8000_0000).
int i = SplatCsts[idx];
// Figure out what shift amount will be used by altivec if shifted by i in
// this splat size.
unsigned TypeShiftAmt = i & (SplatBitSize-1);
// vsplti + shl self.
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
Intrinsic::ppc_altivec_vslw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + srl self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
Intrinsic::ppc_altivec_vsrw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + sra self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
Intrinsic::ppc_altivec_vsraw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
Intrinsic::ppc_altivec_vrlw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
}
return SDValue();
}
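// A stand-alone sketch (hypothetical helper, not part of this patch) of the
// splat arithmetic used above: the splat bits are sign-extended from
// SplatBitSize to 32 bits so the VSPLTI[bhw] range checks can be applied.
static int32_t exampleSextSplatValue(uint64_t SplatBits, unsigned SplatBitSize) {
  // e.g. SplatBits = 0xAB, SplatBitSize = 8 yields -85 (0xFFFFFFAB).
  return int32_t(SplatBits << (32 - SplatBitSize)) >> (32 - SplatBitSize);
}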
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VMRGHW,
OP_VMRGLW,
OP_VSPLTISW0,
OP_VSPLTISW1,
OP_VSPLTISW2,
OP_VSPLTISW3,
OP_VSLDOI4,
OP_VSLDOI8,
OP_VSLDOI12
};
if (OpNum == OP_COPY) {
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
return RHS;
}
SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
int ShufIdxs[16];
switch (OpNum) {
default: llvm_unreachable("Unknown i32 permute!");
case OP_VMRGHW:
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
break;
case OP_VMRGLW:
ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
break;
case OP_VSPLTISW0:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+0;
break;
case OP_VSPLTISW1:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+4;
break;
case OP_VSPLTISW2:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+8;
break;
case OP_VSPLTISW3:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+12;
break;
case OP_VSLDOI4:
return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI8:
return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI12:
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
}
EVT VT = OpLHS.getValueType();
OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
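// A decoding sketch (hypothetical struct and helper, not part of this patch)
// for the perfect-shuffle table entries consumed above: bits [31:30] hold the
// cost, [29:26] the operation, [25:13] the LHS entry and [12:0] the RHS entry.
struct ExamplePFEntryFields {
  unsigned Cost, OpNum, LHSID, RHSID;
};
static ExamplePFEntryFields exampleDecodePFEntry(unsigned PFEntry) {
  return {PFEntry >> 30, (PFEntry >> 26) & 0x0F,
          (PFEntry >> 13) & ((1 << 13) - 1), PFEntry & ((1 << 13) - 1)};
}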
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, otherwise return the
/// default SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned BytesInVector = 16;
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the byte we want at element 7.
unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
0, 15, 14, 13, 12, 11, 10, 9};
unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
1, 2, 3, 4, 5, 6, 7, 8};
ArrayRef<int> Mask = N->getMask();
int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa.
// Possible permutations inserting an element from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// ...
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
// Inserting from V1 into V2 will be similar, except the mask range will be
// [16,31].
bool FoundCandidate = false;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
// Go through the mask of half-words to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < BytesInVector; ++i) {
unsigned CurrentElement = Mask[i];
// If the 2nd operand is undefined, we should only look for the source
// element (7 on big endian, 8 on little endian) in the Mask.
if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
continue;
bool OtherElementsInOrder = true;
// Examine the other elements in the Mask to see if they're in original
// order.
for (unsigned j = 0; j < BytesInVector; ++j) {
if (j == i)
continue;
// If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
// to be from V2 [16,31] and vice versa, unless the 2nd operand is undefined,
// in which case we assume we're always picking from the 1st operand.
int MaskOffset =
(!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
if (Mask[j] != OriginalOrder[j] + MaskOffset) {
OtherElementsInOrder = false;
break;
}
}
// If other elements are in original order, we record the number of shifts
// we need to get the element we want into element 7. Also record which byte
// in the vector we should insert into.
if (OtherElementsInOrder) {
// If 2nd operand is undefined, we assume no shifts and no swapping.
if (V2.isUndef()) {
ShiftElts = 0;
Swap = false;
} else {
// Only need the last 4-bits for shifts because operands will be swapped
// if CurrentElement is >= 2^4.
ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
: BigEndianShifts[CurrentElement & 0xF];
Swap = CurrentElement < BytesInVector;
}
InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
FoundCandidate = true;
break;
}
}
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTB,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
if (V2.isUndef())
V2 = V1;
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, otherwise return the
/// default SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned NumHalfWords = 8;
const unsigned BytesInVector = NumHalfWords * 2;
// Check that the shuffle is on half-words.
if (!isNByteElemShuffleMask(N, 2, 1))
return SDValue();
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the half-word we want at element 3.
unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
uint32_t Mask = 0;
uint32_t OriginalOrderLow = 0x1234567;
uint32_t OriginalOrderHigh = 0x89ABCDEF;
// Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
// 32-bit space, since we only need a 4-bit nibble per element.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
}
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa. Possible permutations inserting an element
// from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7
// 0, X, 2, 3, 4, 5, 6, 7
// 0, 1, X, 3, 4, 5, 6, 7
// 0, 1, 2, X, 4, 5, 6, 7
// 0, 1, 2, 3, X, 5, 6, 7
// 0, 1, 2, 3, 4, X, 6, 7
// 0, 1, 2, 3, 4, 5, X, 7
// 0, 1, 2, 3, 4, 5, 6, X
// Inserting from V1 into V2 will be similar, except mask range will be [8,15].
bool FoundCandidate = false;
// Go through the mask of half-words to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
uint32_t MaskOtherElts = ~(0xF << MaskShift);
uint32_t TargetOrder = 0x0;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
if (V2.isUndef()) {
ShiftElts = 0;
unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
TargetOrder = OriginalOrderLow;
Swap = false;
// Skip if this is not the correct element or if the mask of the other
// elements doesn't match our expected order.
if (MaskOneElt == VINSERTHSrcElem &&
(Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
FoundCandidate = true;
break;
}
} else { // If both operands are defined.
// Target order is [8,15] if the current mask is between [0,7].
TargetOrder =
(MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
// Skip if the mask of the other elements doesn't match our expected order.
if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
// We only need the last 3 bits for the number of shifts.
ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
: BigEndianShifts[MaskOneElt & 0x7];
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
Swap = MaskOneElt < NumHalfWords;
FoundCandidate = true;
break;
}
}
}
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTH,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
if (V2.isUndef())
V2 = V1;
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
if (ShiftElts) {
// Double ShiftElts because we're left shifting on v16i8 type.
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
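// A packing sketch (hypothetical helper, not part of this patch) mirroring
// the nibble encoding used in lowerToVINSERTH above: eight half-word indices
// are packed big-endian into a 32-bit word, so the unshuffled low order reads
// 0x01234567.
static uint32_t examplePackHalfWordMask(ArrayRef<int> HalfWordElts) {
  uint32_t Packed = 0;
  for (unsigned i = 0; i < 8; ++i)
    Packed |= uint32_t(HalfWordElts[i] & 0xF) << ((8 - 1 - i) * 4);
  return Packed; // {0,1,2,3,4,5,6,7} -> 0x01234567
}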
/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
/// return the default SDValue.
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
// The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
// to v16i8. Peek through the bitcasts to get the actual operands.
SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
auto ShuffleMask = SVN->getMask();
SDValue VecShuffle(SVN, 0);
SDLoc DL(SVN);
// Check that we have a four byte shuffle.
if (!isNByteElemShuffleMask(SVN, 4, 1))
return SDValue();
// Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
std::swap(LHS, RHS);
VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
}
// Ensure that the RHS is a vector of constants.
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
if (!BVN)
return SDValue();
// Check if RHS is a splat of 4-bytes (or smaller).
APInt APSplatValue, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
SplatBitSize > 32)
return SDValue();
// Check that the shuffle mask matches the semantics of XXSPLTI32DX.
// The instruction splats a constant C into two words of the source vector
// producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
// Thus we check that the shuffle mask is the equivalent of
// <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
// Note: the check above of isNByteElemShuffleMask() ensures that the bytes
// within each word are consecutive, so we only need to check the first byte.
SDValue Index;
bool IsLE = Subtarget.isLittleEndian();
if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
(ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
(ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
else
return SDValue();
// If the splat is narrower than 32-bits, we need to get the 32-bit value
// for XXSPLTI32DX.
unsigned SplatVal = APSplatValue.getZExtValue();
for (; SplatBitSize < 32; SplatBitSize <<= 1)
SplatVal |= (SplatVal << SplatBitSize);
SDValue SplatNode = DAG.getNode(
PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
}
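// A sketch (hypothetical helper, not part of this patch) of the splat
// widening performed above for XXSPLTI32DX: a sub-32-bit splat constant is
// replicated up to 32 bits, e.g. 0xAB -> 0xABABABAB and 0xABCD -> 0xABCDABCD.
static uint32_t exampleWidenSplatTo32(uint32_t SplatVal, unsigned SplatBitSize) {
  for (; SplatBitSize < 32; SplatBitSize <<= 1)
    SplatVal |= SplatVal << SplatBitSize;
  return SplatVal;
}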
/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if the shift amount is
/// a multiple of 8. Otherwise we convert it to a scalar rotation (i128),
/// i.e. (or (shl x, C1), (srl x, 128-C1)).
SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
assert(Op.getValueType() == MVT::v1i128 &&
"Only set v1i128 as custom, other type shouldn't reach here!");
SDLoc dl(Op);
SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
unsigned SHLAmt = N1.getConstantOperandVal(0);
if (SHLAmt % 8 == 0) {
SmallVector<int, 16> Mask(16, 0);
std::iota(Mask.begin(), Mask.end(), 0);
std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
if (SDValue Shuffle =
DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
DAG.getUNDEF(MVT::v16i8), Mask))
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
}
SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
DAG.getConstant(SHLAmt, dl, MVT::i32));
SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
}
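// A sketch (hypothetical helper, not part of this patch) of the mask built
// above: a rotate-left of v1i128 by 8*K bits is the identity byte mask
// rotated by K positions, e.g. K = 2 gives {2,3,...,15,0,1}.
static SmallVector<int, 16> exampleRotlByteMask(unsigned SHLAmt) {
  SmallVector<int, 16> Mask(16, 0);
  std::iota(Mask.begin(), Mask.end(), 0);
  std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
  return Mask;
}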
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
// Any nodes that were combined in the target-independent combiner prior
// to vector legalization will not be sent to the target combine. Try to
// combine it here.
if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
if (!isa<ShuffleVectorSDNode>(NewShuffle))
return NewShuffle;
Op = NewShuffle;
SVOp = cast<ShuffleVectorSDNode>(Op);
V1 = Op.getOperand(0);
V2 = Op.getOperand(1);
}
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ShiftElts, InsertAtByte;
bool Swap = false;
// If this is a load-and-splat, we can do that with a single instruction
// in some cases. However if the load has multiple uses, we don't want to
// combine it because that will just produce multiple loads.
- const SDValue *InputLoad = getNormalLoadInput(V1);
+ bool IsPermutedLoad = false;
+ const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
InputLoad->hasOneUse()) {
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
int SplatIdx =
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+ // The splat index for permuted loads will be in the left half of the
+ // vector, which is strictly wider than the loaded value by 8 bytes, so we
+ // need to adjust the splat index to point to the correct address in memory.
+ if (IsPermutedLoad) {
+ assert(isLittleEndian && "Unexpected permuted load on big endian target");
+ SplatIdx += IsFourByte ? 2 : 1;
+ assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
+ "Splat of a value outside of the loaded memory");
+ }
+
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// For 4-byte load-and-splat, we need Power9.
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
uint64_t Offset = 0;
if (IsFourByte)
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
- // If we are loading a partial vector, it does not make sense to adjust
- // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
- if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
- Offset = 0;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
BasePtr, DAG.getIntPtrConstant(Offset, dl));
SDValue Ops[] = {
LD->getChain(), // Chain
BasePtr, // BasePtr
DAG.getValueType(Op.getValueType()) // VT
};
SDVTList VTL =
DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
SDValue LdSplt =
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
Ops, LD->getMemoryVT(), LD->getMemOperand());
if (LdSplt.getValueType() != SVOp->getValueType(0))
LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
return LdSplt;
}
}
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
if (Subtarget.hasPrefixInstrs()) {
SDValue SplatInsertNode;
if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
return SplatInsertNode;
}
if (Subtarget.hasP9Altivec()) {
SDValue NewISDNode;
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
return NewISDNode;
if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
return NewISDNode;
}
if (Subtarget.hasVSX() &&
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
}
if (Subtarget.hasVSX() &&
PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
}
if (Subtarget.hasP9Vector()) {
if (PPC::isXXBRHShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
} else if (PPC::isXXBRWShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
} else if (PPC::isXXBRDShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
} else if (PPC::isXXBRQShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
}
}
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
DAG.getConstant(SplatIdx, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
}
// Left shifts of 8 bytes are actually swaps. Convert accordingly.
if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
}
}
if (Subtarget.hasQPX()) {
if (VT.getVectorNumElements() != 4)
return SDValue();
if (V2.isUndef()) V2 = V1;
int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
if (AlignIdx != -1) {
return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
DAG.getConstant(AlignIdx, dl, MVT::i32));
} else if (SVOp->isSplat()) {
int SplatIdx = SVOp->getSplatIndex();
if (SplatIdx >= 4) {
std::swap(V1, V2);
SplatIdx -= 4;
}
return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
DAG.getConstant(SplatIdx, dl, MVT::i32));
}
// Lower this into a qvgpci/qvfperm pair.
// Compute the qvgpci literal
unsigned idx = 0;
for (unsigned i = 0; i < 4; ++i) {
int m = SVOp->getMaskElt(i);
unsigned mm = m >= 0 ? (unsigned) m : i;
idx |= mm << (3-i)*3;
}
SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
DAG.getConstant(idx, dl, MVT::i32));
return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
}
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
// selected by the instruction selector.
if (V2.isUndef()) {
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
return Op;
}
}
// Altivec has a variety of "shuffle immediates" that take two vector inputs
// and produce a fixed permutation. If any of these match, do not lower to
// VPERM.
unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
return Op;
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
// perfect shuffle table to emit an optimal matching sequence.
ArrayRef<int> PermMask = SVOp->getMask();
unsigned PFIndexes[4];
bool isFourElementShuffle = true;
for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
unsigned EltNo = 8; // Start out undef.
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
if (PermMask[i*4+j] < 0)
continue; // Undef, ignore it.
unsigned ByteSource = PermMask[i*4+j];
if ((ByteSource & 3) != j) {
isFourElementShuffle = false;
break;
}
if (EltNo == 8) {
EltNo = ByteSource/4;
} else if (EltNo != ByteSource/4) {
isFourElementShuffle = false;
break;
}
}
PFIndexes[i] = EltNo;
}
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
// perfect shuffle vector to determine if it is cost effective to do this as
// discrete instructions, or whether we should use a vperm.
// For now, we skip this for little endian until such time as we have a
// little-endian perfect shuffle table.
if (isFourElementShuffle && !isLittleEndian) {
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
// Determining when to avoid vperm is tricky. Many things affect the cost
// of vperm, particularly how many times the perm mask needs to be computed.
// For example, if the perm mask can be hoisted out of a loop or is already
// used (perhaps because there are multiple permutes with the same shuffle
// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
// the loop requires an extra register.
//
// As a compromise, we only emit discrete instructions if the shuffle can be
// generated in 3 or fewer operations. When we have loop information
// available, if this block is within a loop, we should avoid using vperm
// for 3-operation perms and use a constant pool load instead.
if (Cost < 3)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
// vector that will get spilled to the constant pool.
if (V2.isUndef()) V2 = V1;
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
// For little endian, the order of the input vectors is reversed, and
// the permutation mask is complemented with respect to 31. This is
// necessary to produce proper semantics with the big-endian-biased vperm
// instruction.
EVT EltVT = V1.getValueType().getVectorElementType();
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
SmallVector<SDValue, 16> ResultMask;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
for (unsigned j = 0; j != BytesPerElement; ++j)
if (isLittleEndian)
ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
dl, MVT::i32));
else
ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
MVT::i32));
}
ShufflesHandledWithVPERM++;
SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n");
LLVM_DEBUG(SVOp->dump());
LLVM_DEBUG(dbgs() << "With the following permute control vector:\n");
LLVM_DEBUG(VPermMask.dump());
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
else
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V1, V2, VPermMask);
}
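// A sketch (hypothetical helper, not part of this patch) of the permute
// control entries built above: on little-endian targets the byte index is
// complemented with respect to 31 (and the inputs are swapped) to match
// vperm's big-endian bias.
static unsigned exampleVPermControlByte(unsigned SrcElt, unsigned BytesPerElt,
                                        unsigned ByteInElt, bool IsLittleEndian) {
  unsigned BigEndianIndex = SrcElt * BytesPerElt + ByteInElt;
  return IsLittleEndian ? 31 - BigEndianIndex : BigEndianIndex;
}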
/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison. If it is, return true and fill in CompareOpc/isDot with
/// information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
CompareOpc = -1;
isDot = false;
switch (IntrinsicID) {
default:
return false;
// Comparison predicates.
case Intrinsic::ppc_altivec_vcmpbfp_p:
CompareOpc = 966;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpeqfp_p:
CompareOpc = 198;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequb_p:
CompareOpc = 6;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequh_p:
CompareOpc = 70;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequw_p:
CompareOpc = 134;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 199;
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpneb_p:
case Intrinsic::ppc_altivec_vcmpneh_p:
case Intrinsic::ppc_altivec_vcmpnew_p:
case Intrinsic::ppc_altivec_vcmpnezb_p:
case Intrinsic::ppc_altivec_vcmpnezh_p:
case Intrinsic::ppc_altivec_vcmpnezw_p:
if (Subtarget.hasP9Altivec()) {
switch (IntrinsicID) {
default:
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb_p:
CompareOpc = 7;
break;
case Intrinsic::ppc_altivec_vcmpneh_p:
CompareOpc = 71;
break;
case Intrinsic::ppc_altivec_vcmpnew_p:
CompareOpc = 135;
break;
case Intrinsic::ppc_altivec_vcmpnezb_p:
CompareOpc = 263;
break;
case Intrinsic::ppc_altivec_vcmpnezh_p:
CompareOpc = 327;
break;
case Intrinsic::ppc_altivec_vcmpnezw_p:
CompareOpc = 391;
break;
}
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgefp_p:
CompareOpc = 454;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtfp_p:
CompareOpc = 710;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsb_p:
CompareOpc = 774;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsh_p:
CompareOpc = 838;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsw_p:
CompareOpc = 902;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 967;
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub_p:
CompareOpc = 518;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtuh_p:
CompareOpc = 582;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtuw_p:
CompareOpc = 646;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtud_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 711;
isDot = true;
} else
return false;
break;
// VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
case Intrinsic::ppc_vsx_xvcmpgesp_p:
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
if (Subtarget.hasVSX()) {
switch (IntrinsicID) {
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
CompareOpc = 99;
break;
case Intrinsic::ppc_vsx_xvcmpgedp_p:
CompareOpc = 115;
break;
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
CompareOpc = 107;
break;
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
CompareOpc = 67;
break;
case Intrinsic::ppc_vsx_xvcmpgesp_p:
CompareOpc = 83;
break;
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
CompareOpc = 75;
break;
}
isDot = true;
} else
return false;
break;
// Normal Comparisons.
case Intrinsic::ppc_altivec_vcmpbfp:
CompareOpc = 966;
break;
case Intrinsic::ppc_altivec_vcmpeqfp:
CompareOpc = 198;
break;
case Intrinsic::ppc_altivec_vcmpequb:
CompareOpc = 6;
break;
case Intrinsic::ppc_altivec_vcmpequh:
CompareOpc = 70;
break;
case Intrinsic::ppc_altivec_vcmpequw:
CompareOpc = 134;
break;
case Intrinsic::ppc_altivec_vcmpequd:
if (Subtarget.hasP8Altivec())
CompareOpc = 199;
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpneb:
case Intrinsic::ppc_altivec_vcmpneh:
case Intrinsic::ppc_altivec_vcmpnew:
case Intrinsic::ppc_altivec_vcmpnezb:
case Intrinsic::ppc_altivec_vcmpnezh:
case Intrinsic::ppc_altivec_vcmpnezw:
if (Subtarget.hasP9Altivec())
switch (IntrinsicID) {
default:
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb:
CompareOpc = 7;
break;
case Intrinsic::ppc_altivec_vcmpneh:
CompareOpc = 71;
break;
case Intrinsic::ppc_altivec_vcmpnew:
CompareOpc = 135;
break;
case Intrinsic::ppc_altivec_vcmpnezb:
CompareOpc = 263;
break;
case Intrinsic::ppc_altivec_vcmpnezh:
CompareOpc = 327;
break;
case Intrinsic::ppc_altivec_vcmpnezw:
CompareOpc = 391;
break;
}
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgefp:
CompareOpc = 454;
break;
case Intrinsic::ppc_altivec_vcmpgtfp:
CompareOpc = 710;
break;
case Intrinsic::ppc_altivec_vcmpgtsb:
CompareOpc = 774;
break;
case Intrinsic::ppc_altivec_vcmpgtsh:
CompareOpc = 838;
break;
case Intrinsic::ppc_altivec_vcmpgtsw:
CompareOpc = 902;
break;
case Intrinsic::ppc_altivec_vcmpgtsd:
if (Subtarget.hasP8Altivec())
CompareOpc = 967;
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub:
CompareOpc = 518;
break;
case Intrinsic::ppc_altivec_vcmpgtuh:
CompareOpc = 582;
break;
case Intrinsic::ppc_altivec_vcmpgtuw:
CompareOpc = 646;
break;
case Intrinsic::ppc_altivec_vcmpgtud:
if (Subtarget.hasP8Altivec())
CompareOpc = 711;
else
return false;
break;
}
return true;
}
/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
return DAG.getRegister(PPC::X13, MVT::i64);
return DAG.getRegister(PPC::R2, MVT::i32);
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
int CompareOpc;
bool isDot;
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
return SDValue(); // Don't custom lower most intrinsics.
// If this is a non-dot comparison, make the VCMP node and we are done.
if (!isDot) {
SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(CompareOpc, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
}
// Create the PPCISD altivec 'dot' comparison node.
SDValue Ops[] = {
Op.getOperand(2), // LHS
Op.getOperand(3), // RHS
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Now that we have the comparison, emit a copy from the CR to a GPR.
// This is flagged to the above dot comparison.
SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
DAG.getRegister(PPC::CR6, MVT::i32),
CompNode.getValue(1));
// Unpack the result based on how the target uses it.
unsigned BitNo; // Bit # of CR6.
bool InvertBit; // Invert result?
switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Return the value of the EQ bit of CR6.
BitNo = 0; InvertBit = false;
break;
case 1: // Return the inverted value of the EQ bit of CR6.
BitNo = 0; InvertBit = true;
break;
case 2: // Return the value of the LT bit of CR6.
BitNo = 2; InvertBit = false;
break;
case 3: // Return the inverted value of the LT bit of CR6.
BitNo = 2; InvertBit = true;
break;
}
// Shift the bit into the low position.
Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
// Isolate the bit.
Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
// If we are supposed to, toggle the bit.
if (InvertBit)
Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
return Flags;
}
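// An arithmetic sketch (hypothetical helper, not part of this patch) of the
// unpacking above: the requested CR6 bit is shifted into the low position,
// isolated, and optionally inverted.
static unsigned exampleUnpackCR6Bit(unsigned Flags, unsigned BitNo,
                                    bool InvertBit) {
  unsigned Bit = (Flags >> (8 - (3 - BitNo))) & 1;
  return InvertBit ? (Bit ^ 1) : Bit;
}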
SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
// SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
// the beginning of the argument list.
int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
SDLoc DL(Op);
switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
Op.getOperand(ArgStart + 1)),
Op.getOperand(0)),
0);
}
default:
break;
}
return SDValue();
}
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
// MTVSRDD
Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
Op.getOperand(0));
// XXBRD
Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
// MFVSRD
int VectorIndex = 0;
if (Subtarget.isLittleEndian())
VectorIndex = 1;
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
return Op;
}
// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
"Expecting an atomic compare-and-swap here.");
SDLoc dl(Op);
auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
EVT MemVT = AtomicNode->getMemoryVT();
if (MemVT.getSizeInBits() >= 32)
return Op;
SDValue CmpOp = Op.getOperand(2);
// If this is already correctly zero-extended, leave it alone.
auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
if (DAG.MaskedValueIsZero(CmpOp, HighBits))
return Op;
// Clear the high bits of the compare operand.
unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
SDValue NewCmpOp =
DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
DAG.getConstant(MaskVal, dl, MVT::i32));
// Replace the existing compare operand with the properly zero-extended one.
SmallVector<SDValue, 4> Ops;
for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
Ops.push_back(AtomicNode->getOperand(i));
Ops[2] = NewCmpOp;
MachineMemOperand *MMO = AtomicNode->getMemOperand();
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
auto NodeTy =
(MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}
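// A sketch (hypothetical helper, not part of this patch) of the mask used
// above to zero-extend the compare operand: i8 uses 0xFF and i16 uses 0xFFFF,
// matching the zero-extension performed by the atomic load.
static unsigned exampleCmpXchgCompareMask(unsigned MemBits) {
  return (1u << MemBits) - 1; // 8 -> 0xFF, 16 -> 0xFFFF
}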
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
if (!C)
return SDValue();
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
// We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
unsigned InsertAtElement = C->getZExtValue();
unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
if (Subtarget.isLittleEndian()) {
InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
}
return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
return Op;
}
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDNode *N = Op.getNode();
assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
"Unknown extract_vector_elt type");
SDValue Value = N->getOperand(0);
// The first part of this is like the store lowering except that we don't
// need to track the chain.
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue StoreChain = DAG.getEntryNode();
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Extract the value requested.
unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
SDValue IntVal =
DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
if (!Subtarget.useCRBits())
return IntVal;
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
}
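// A scalar sketch (hypothetical helper, not part of this patch) of the
// boolean conversion used above: QPX booleans are -1.0 (false) / 1.0 (true),
// and the fma (V + 1.0) * 0.5 = 0.5 * V + 0.5 maps them to 0.0 and 1.0.
static double exampleQPXBoolToZeroOne(double V) {
  return V * 0.5 + 0.5; // -1.0 -> 0.0, 1.0 -> 1.0
}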
/// Lowering for QPX v4i1 loads
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
SDValue LoadChain = LN->getChain();
SDValue BasePtr = LN->getBasePtr();
if (Op.getValueType() == MVT::v4f64 ||
Op.getValueType() == MVT::v4f32) {
EVT MemVT = LN->getMemoryVT();
unsigned Alignment = LN->getAlignment();
// If this load is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
EVT ScalarVT = Op.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Vals[4], LoadChains[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Load;
if (ScalarVT != ScalarMemVT)
Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
BasePtr,
LN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
else
Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
LN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
if (Idx == 0 && LN->isIndexed()) {
assert(LN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector load");
Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
LN->getAddressingMode());
}
Vals[Idx] = Load;
LoadChains[Idx] = Load.getValue(1);
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
BasePtr.getValueType()));
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
if (LN->isIndexed()) {
SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
return DAG.getMergeValues(RetOps, dl);
}
SDValue RetOps[] = { Value, TF };
return DAG.getMergeValues(RetOps, dl);
}
assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
// To lower v4i1 from a byte array, we load the byte elements of the
// vector and then reuse the BUILD_VECTOR logic.
SDValue VectElmts[4], VectElmtChains[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
VectElmts[i] = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
LN->getPointerInfo().getWithOffset(i), MVT::i8,
/* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
VectElmtChains[i] = VectElmts[i].getValue(1);
}
LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
SDValue RVals[] = { Value, LoadChain };
return DAG.getMergeValues(RVals, dl);
}
/// Lowering for QPX v4i1 stores
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
SDValue StoreChain = SN->getChain();
SDValue BasePtr = SN->getBasePtr();
SDValue Value = SN->getValue();
if (Value.getValueType() == MVT::v4f64 ||
Value.getValueType() == MVT::v4f32) {
EVT MemVT = SN->getMemoryVT();
unsigned Alignment = SN->getAlignment();
// If this store is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
EVT ScalarVT = Value.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
DAG.getVectorIdxConstant(Idx, dl));
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
else
Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
if (Idx == 0 && SN->isIndexed()) {
assert(SN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector store");
Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
SN->getAddressingMode());
}
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
BasePtr.getValueType()));
Stores[Idx] = Store;
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
if (SN->isIndexed()) {
SDValue RetOps[] = { TF, Stores[0].getValue(1) };
return DAG.getMergeValues(RetOps, dl);
}
return TF;
}
assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
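// Illustration of the arithmetic: a lane holding -1.0 (false) becomes
// (-1.0 + 1.0) * 0.5 = 0.0, and a lane holding 1.0 (true) becomes
// (1.0 + 1.0) * 0.5 = 1.0; the single FMA below computes the equivalent
// 0.5*V + 0.5 directly.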
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Move data into the byte array.
SDValue Loads[4], LoadChains[4];
for (unsigned i = 0; i < 4; ++i) {
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
PtrInfo.getWithOffset(Offset));
LoadChains[i] = Loads[i].getValue(1);
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
SDValue Stores[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
Stores[i] = DAG.getTruncStore(
StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
SN->getAAInfo());
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
return StoreChain;
}
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
// +16 as shift amt.
SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
SDValue RHSSwap = // = vrlw RHS, 16
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
// Shrinkify inputs to v8i16.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
// Low parts multiplied together, generating 32-bit results (we ignore the
// top parts).
SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
LHS, RHS, DAG, dl, MVT::v4i32);
SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
// Shift the high parts up 16 bits.
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);
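// Illustration, per 32-bit lane: writing a = aH*2^16 + aL and
// b = bH*2^16 + bL, the product modulo 2^32 is
//   aL*bL + ((aH*bL + aL*bH) << 16).
// LoProd supplies aL*bL, and HiProd (built from the rotated RHS)
// supplies the cross products aH*bL + aL*bH, so the ADD below yields
// the low 32 bits of the full product.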
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();
// Multiply the even 8-bit parts, producing 16-bit sums.
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
LHS, RHS, DAG, dl, MVT::v8i16);
EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
// Multiply the odd 8-bit parts, producing 16-bit sums.
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
LHS, RHS, DAG, dl, MVT::v8i16);
OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
// Merge the results together. Because vmuleub and vmuloub are
// instructions with a big-endian bias, we must reverse the
// element numbering and reverse the meaning of "odd" and "even"
// when generating little endian code.
int Ops[16];
for (unsigned i = 0; i != 8; ++i) {
if (isLittleEndian) {
Ops[i*2 ] = 2*i;
Ops[i*2+1] = 2*i+16;
} else {
Ops[i*2 ] = 2*i+1;
Ops[i*2+1] = 2*i+1+16;
}
}
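// Illustration: on big-endian the shuffle indices come out as
//   {1, 17, 3, 19, 5, 21, ...}
// i.e. the low (odd-numbered) byte of each 16-bit product, alternating
// between the first shuffle operand and the second (the +16 selects the
// second operand). Little-endian uses the even byte indices
// {0, 16, 2, 18, ...} and swaps the operand order below.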
if (isLittleEndian)
return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
else
return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
} else {
llvm_unreachable("Unknown mul to lower!");
}
}
SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
EVT VT = Op.getValueType();
assert(VT.isVector() &&
"Only set vector abs as custom, scalar abs shouldn't reach here!");
assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
VT == MVT::v16i8) &&
"Unexpected vector element type!");
assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
"Current subtarget doesn't support smax v2i64!");
// For vector abs, it can be lowered to:
// abs x
// ==>
// y = -x
// smax(x, y)
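// e.g. per lane: x = -5 gives y = 5 and smax(-5, 5) = 5, while x = 7
// gives y = -7 and smax(7, -7) = 7.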
SDLoc dl(Op);
SDValue X = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
// The SMAX patch (https://reviews.llvm.org/D47332) hasn't landed yet,
// so use the intrinsics here for now.
// TODO: Use SMAX directly once that patch has landed.
Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
if (VT == MVT::v2i64)
BifID = Intrinsic::ppc_altivec_vmaxsd;
else if (VT == MVT::v8i16)
BifID = Intrinsic::ppc_altivec_vmaxsh;
else if (VT == MVT::v16i8)
BifID = Intrinsic::ppc_altivec_vmaxsb;
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
}
// Custom lowering for fpext v2f32 to v2f64
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::FP_EXTEND &&
"Should only be called for ISD::FP_EXTEND");
// FIXME: handle extends from half precision float vectors on P9.
// We only want to custom lower an extend from v2f32 to v2f64.
if (Op.getValueType() != MVT::v2f64 ||
Op.getOperand(0).getValueType() != MVT::v2f32)
return SDValue();
SDLoc dl(Op);
SDValue Op0 = Op.getOperand(0);
switch (Op0.getOpcode()) {
default:
return SDValue();
case ISD::EXTRACT_SUBVECTOR: {
assert(Op0.getNumOperands() == 2 &&
isa<ConstantSDNode>(Op0->getOperand(1)) &&
"Node should have 2 operands with second one being a constant!");
if (Op0.getOperand(0).getValueType() != MVT::v4f32)
return SDValue();
// Custom lowering is only done for the high or low doubleword.
int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (Idx % 2 != 0)
return SDValue();
// Since the input is v4f32, at this point Idx is either 0 or 2.
// Shift to get the doubleword position we want.
int DWord = Idx >> 1;
// High and low word positions are different on little endian.
if (Subtarget.isLittleEndian())
DWord ^= 0x1;
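// e.g. an EXTRACT_SUBVECTOR at Idx == 2 selects elements {2,3} of the
// v4f32 input, so DWord is 1 on big-endian targets, and the XOR above
// flips it to 0 on little-endian targets, where the doubleword halves
// are numbered the other way around.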
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
}
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB: {
SDValue NewLoad[2];
for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
// Ensure both inputs are loads.
SDValue LdOp = Op0.getOperand(i);
if (LdOp.getOpcode() != ISD::LOAD)
return SDValue();
// Generate new load node.
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
NewLoad[i] = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
}
SDValue NewOp =
DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
NewLoad[1], Op0.getNode()->getFlags());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
DAG.getConstant(0, dl, MVT::i32));
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op0);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
SDValue NewLd = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
DAG.getConstant(0, dl, MVT::i32));
}
}
llvm_unreachable("ERROR:Should return for all cases within swtich.");
}
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
// Variable argument lowering.
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GET_DYNAMIC_AREA_OFFSET:
return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
// Exception handling lowering.
case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
// Lower 64-bit shifts.
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
// Vector-related lowering.
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::ROTL: return LowerROTL(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
return LowerATOMIC_CMP_SWAP(Op, DAG);
}
}
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::READCYCLECOUNTER: {
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
Results.push_back(
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
Results.push_back(RTB.getValue(2));
break;
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
"Unexpected result type for CTR decrement intrinsic");
EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
N->getValueType(0));
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
N->getOperand(1));
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
Results.push_back(NewInt.getValue(1));
break;
}
case ISD::VAARG: {
if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
return;
EVT VT = N->getValueType(0);
if (VT == MVT::i64) {
SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
Results.push_back(NewNode);
Results.push_back(NewNode.getValue(1));
}
return;
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// LowerFP_TO_INT() can only handle f32 and f64.
if (N->getOperand(0).getValueType() == MVT::ppcf128)
return;
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
return;
case ISD::TRUNCATE: {
EVT TrgVT = N->getValueType(0);
EVT OpVT = N->getOperand(0).getValueType();
if (TrgVT.isVector() &&
isOperationCustom(N->getOpcode(), TrgVT) &&
OpVT.getSizeInBits() <= 128 &&
isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits()))
Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
return;
}
case ISD::BITCAST:
// Don't handle bitcast here.
return;
case ISD::FP_EXTEND:
SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
if (Lowered)
Results.push_back(Lowered);
return;
}
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, Id);
return Builder.CreateCall(Func, {});
}
// The mappings for emitLeadingFence/emitTrailingFence are taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
return nullptr;
}
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
// and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
return Builder.CreateCall(
Intrinsic::getDeclaration(
Builder.GetInsertBlock()->getParent()->getParent(),
Intrinsic::ppc_cfence, {Inst->getType()}),
{Inst});
// FIXME: Can use isync for rmw operation.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
}
return nullptr;
}
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
unsigned AtomicSize,
unsigned BinOpcode,
unsigned CmpOpcode,
unsigned CmpPred) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (AtomicSize) {
default:
llvm_unreachable("Unexpected size of atomic entity");
case 1:
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
break;
case 2:
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
break;
case 4:
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
break;
case 8:
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
break;
}
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register TmpReg = (!BinOpcode) ? incr :
RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
: &PPC::GPRCRegClass);
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loopMBB);
// loopMBB:
// l[wd]arx dest, ptr
// add r0, dest, incr
// st[wd]cx. r0, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// For max/min...
// loopMBB:
// l[wd]arx dest, ptr
// cmpl?[wd] incr, dest
// bgt exitMBB
// loop2MBB:
// st[wd]cx. dest, ptr
// bne- loopMBB
// fallthrough --> exitMBB
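// Informally, the generated loop is the usual load-reserve /
// store-conditional retry pattern; e.g. for a 32-bit fetch-and-add it
// behaves like (sketch only):
//   do { old = lwarx(ptr); } while (!stwcx(ptr, old + incr));
// with 'dest' left holding the value observed before the update.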
BB = loopMBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
.addReg(ptrA).addReg(ptrB);
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
if (CmpOpcode) {
// Signed comparisons of byte or halfword values must be sign-extended.
if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
ExtReg).addReg(dest);
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(incr).addReg(ExtReg);
} else
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(incr).addReg(dest);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(exitMBB);
BB = loop2MBB;
}
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
return BB;
}
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
MachineInstr &MI, MachineBasicBlock *BB,
bool is8bit, // operation
unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
// If we support part-word atomic mnemonics, just use them
if (Subtarget.hasPartwordAtomics())
return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
CmpPred);
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// In 64-bit mode we have to use 64 bits for addresses, even though the
// lwarx/stwcx. accesses are only 32 bits wide. With the 32-bit atomics we
// can use address registers without caring whether they're 32 or 64 bits,
// but here we're doing actual arithmetic on the addresses.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loopMBB);
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw incr2, incr, shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// loopMBB:
// lwarx tmpDest, ptr
// add tmp, tmpDest, incr2
// andc tmp2, tmpDest, mask
// and tmp3, tmp, mask
// or tmp4, tmp3, tmp2
// stwcx. tmp4, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// srw dest, tmpDest, shift
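// Worked example of the bookkeeping (illustration): for an 8-bit
// operand at byte offset 1 within its aligned word, shift1 = 8. On
// big-endian the xori gives shift = 8 ^ 24 = 16, so
// mask = 0xFF << 16 = 0x00FF0000, which is where byte 1 lives in a
// big-endian word; on little-endian shift stays 8 and
// mask = 0x0000FF00. The loop below then updates only the masked lane
// of the aligned word.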
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
.addReg(ptrA)
.addReg(ptrB);
} else {
Ptr1Reg = ptrB;
}
// We need to use the 32-bit subregister to avoid a register class mismatch in
// 64-bit mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(3)
.addImm(27)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addReg(Shift1Reg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(61);
else
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(0)
.addImm(29);
BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
.addReg(Mask3Reg)
.addImm(65535);
}
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
.addReg(Mask2Reg)
.addReg(ShiftReg);
BB = loopMBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
.addReg(Incr2Reg)
.addReg(TmpDestReg);
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);
if (CmpOpcode) {
// For unsigned comparisons, we can directly compare the shifted values.
// For signed comparisons we shift and sign extend.
Register SReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::AND), SReg)
.addReg(TmpDestReg)
.addReg(MaskReg);
unsigned ValueReg = SReg;
unsigned CmpReg = Incr2Reg;
if (CmpOpcode == PPC::CMPW) {
ValueReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
.addReg(SReg)
.addReg(ShiftReg);
Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
.addReg(ValueReg);
ValueReg = ValueSReg;
CmpReg = incr;
}
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(CmpReg)
.addReg(ValueReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(CmpPred)
.addReg(PPC::CR0)
.addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(exitMBB);
BB = loop2MBB;
}
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(Tmp4Reg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
.addReg(TmpDestReg)
.addReg(ShiftReg);
return BB;
}
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// SjLjSetup mainMBB
// bl mainMBB
// v_restore = 1
// b sinkMBB
//
// mainMBB:
// buf[LabelOffset] = LR
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// Note that the structure of the jmp_buf used here is not compatible
// with that used by libc, and is not designed to be. Specifically, it
// stores only those 'reserved' registers that LLVM does not otherwise
// understand how to spill. Also, by convention, by the time this
// intrinsic is called, Clang has already stored the frame address in the
// first slot of the buffer and stack address in the third. Following the
// X86 target code, we'll store the jump address in the second slot. We also
// need to save the TOC pointer (R2) to handle jumps between shared
// libraries, and that will be stored in the fourth slot. The thread
// identifier (R13) is not affected.
// thisMBB:
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
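// In PVT-sized slots, the jmp_buf layout used here is therefore:
//   [0] frame address (stored by Clang),   [1] jump IP (LabelOffset),
//   [2] stack address (stored by Clang),   [3] TOC / R2 (TOCOffset),
//   [4] base pointer (BPOffset).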
// Prepare the IP in a register.
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register LabelReg = MRI.createVirtualRegister(PtrRC);
Register BufReg = MI.getOperand(1).getReg();
if (Subtarget.is64BitELFABI()) {
setUsesTOCBasePtr(*MBB->getParent());
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
.addReg(PPC::X2)
.addImm(TOCOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
}
// Naked functions never have a base pointer, and so we use r1. For all
// other functions, this decision must be delayed until PEI.
unsigned BaseReg;
if (MF->getFunction().hasFnAttribute(Attribute::Naked))
BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
else
BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
MIB = BuildMI(*thisMBB, MI, DL,
TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
.addReg(BaseReg)
.addImm(BPOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
MIB.addRegMask(TRI->getNoPreservedMask());
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
.addMBB(mainMBB);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
// mainMBB:
// mainDstReg = 0
MIB =
BuildMI(mainMBB, DL,
TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
// Store IP
if (Subtarget.isPPC64()) {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
.addReg(LabelReg)
.addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
.addReg(LabelReg)
.addImm(LabelOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(PPC::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(thisMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
unsigned BP =
(PVT == MVT::i64)
? PPC::X30
: (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
: PPC::R30);
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
Register BufReg = MI.getOperand(0).getReg();
// Reload FP (the jumped-to function may not have had a
// frame pointer, and if so, then its r31 will be restored
// as necessary).
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
.addImm(0)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
.addImm(0)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload IP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
.addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
.addImm(LabelOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload SP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
.addImm(SPOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
.addImm(SPOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload BP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
.addImm(BPOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
.addImm(BPOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload TOC
if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
setUsesTOCBasePtr(*MBB->getParent());
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
.addImm(TOCOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
}
// Jump
BuildMI(*MBB, MI, DL,
TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
MI.eraseFromParent();
return MBB;
}
bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
// If the function specifically requests inline stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
"inline-asm";
return false;
}
unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const {
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
"Unexpected stack alignment");
// The default stack probe size is 4096 if the function has no
// stack-probe-size attribute.
unsigned StackProbeSize = 4096;
const Function &Fn = MF.getFunction();
if (Fn.hasFnAttribute("stack-probe-size"))
Fn.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
// Round down to the stack alignment.
StackProbeSize &= ~(StackAlign - 1);
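// e.g. with StackAlign = 16, a requested "stack-probe-size" of 4100 is
// rounded down to 4096, while a request smaller than the alignment
// rounds to 0 and falls back to StackAlign below.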
return StackProbeSize ? StackProbeSize : StackAlign;
}
// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
// into three phases. In the first phase, it uses the pseudo instruction
// PREPARE_PROBED_ALLOCA to get the future values of the actual FramePointer and
// FinalStackPtr. In the second phase, it generates a loop that probes the
// blocks. Finally, it uses the pseudo instruction DYNAREAOFFSET to get the
// future value of MaxCallFrameSize so that it can compute the correct data
// area pointer.
MachineBasicBlock *
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const bool isPPC64 = Subtarget.isPPC64();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const unsigned ProbeSize = getStackProbeSize(*MF);
const BasicBlock *ProbedBB = MBB->getBasicBlock();
MachineRegisterInfo &MRI = MF->getRegInfo();
// The CFG of probing stack looks as
// +-----+
// | MBB |
// +--+--+
// |
// +----v----+
// +--->+ TestMBB +---+
// | +----+----+ |
// | | |
// | +-----v----+ |
// +---+ BlockMBB | |
// +----------+ |
// |
// +---------+ |
// | TailMBB +<--+
// +---------+
// In MBB, calculate previous frame pointer and final stack pointer.
// In TestMBB, test if sp is equal to final stack pointer, if so, jump to
// TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
// TailMBB is spliced via \p MI.
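// Informally, the three phases amount to (a sketch, not the emitted code):
//   *(sp += residual) = fp;          // MBB: probe the leading odd-sized tail
//   while (sp != FinalStackPtr)      // TestMBB
//     *(sp -= ProbeSize) = fp;       // BlockMBB: touch each full block
//   dst = sp + MaxCallFrameSize;     // TailMBB: compute the data area pointer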
MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MF->insert(MBBIter, TestMBB);
MF->insert(MBBIter, BlockMBB);
MF->insert(MBBIter, TailMBB);
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register DstReg = MI.getOperand(0).getReg();
Register NegSizeReg = MI.getOperand(1).getReg();
Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
// Since the value of NegSizeReg might be realigned during prologue/epilogue
// insertion, insert a PREPARE_PROBED_ALLOCA pseudo instruction to get the
// actual FramePointer and NegSize.
unsigned ProbeOpc;
if (!MRI.hasOneNonDBGUse(NegSizeReg))
ProbeOpc =
isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
else
// When NegSizeReg has only one use (the current MI, which will be replaced
// by PREPARE_PROBED_ALLOCA), use the _NEGSIZE_SAME_REG variant so that
// ActualNegSizeReg and NegSizeReg are allocated to the same physical
// register, avoiding a redundant copy.
ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
: PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer)
.addDef(ActualNegSizeReg)
.addReg(NegSizeReg)
.add(MI.getOperand(2))
.add(MI.getOperand(3));
// Calculate final stack pointer, which equals to SP + ActualNegSize.
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
FinalStackPtr)
.addReg(SPReg)
.addReg(ActualNegSizeReg);
// Materialize a scratch register for update.
int64_t NegProbeSize = -(int64_t)ProbeSize;
assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
if (!isInt<16>(NegProbeSize)) {
Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
.addImm(NegProbeSize >> 16);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
ScratchReg)
.addReg(TempReg)
.addImm(NegProbeSize & 0xFFFF);
} else
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
.addImm(NegProbeSize);
{
// Probing leading residual part.
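// Illustration of the arithmetic: with ProbeSize = 4096 and a 10000-byte
// allocation, ActualNegSize = -10000 and ScratchReg = -4096, so
// Div = 2, Mul = -8192 and NegMod = -10000 - (-8192) = -1808. The
// stdux/stwux below probes and advances SP by that residual; the
// remaining -8192 is handled by the ProbeSize-sized loop in BlockMBB.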
Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
.addReg(ActualNegSizeReg)
.addReg(ScratchReg);
Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
.addReg(Div)
.addReg(ScratchReg);
Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
.addReg(Mul)
.addReg(ActualNegSizeReg);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
.addReg(FramePointer)
.addReg(SPReg)
.addReg(NegMod);
}
{
// Remaining part should be multiple of ProbeSize.
Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
.addReg(SPReg)
.addReg(FinalStackPtr);
BuildMI(TestMBB, DL, TII->get(PPC::BCC))
.addImm(PPC::PRED_EQ)
.addReg(CmpResult)
.addMBB(TailMBB);
TestMBB->addSuccessor(BlockMBB);
TestMBB->addSuccessor(TailMBB);
}
{
// Touch the block.
// |P...|P...|P...
BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
.addReg(FramePointer)
.addReg(SPReg)
.addReg(ScratchReg);
BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
BlockMBB->addSuccessor(TestMBB);
}
// The calculation of MaxCallFrameSize is deferred to prologue/epilogue
// insertion; use the DYNAREAOFFSET pseudo instruction to get the future result.
Register MaxCallFrameSizeReg =
MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(TailMBB, DL,
TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
MaxCallFrameSizeReg)
.add(MI.getOperand(2))
.add(MI.getOperand(3));
BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
.addReg(SPReg)
.addReg(MaxCallFrameSizeReg);
// Splice instructions after MI to TailMBB.
TailMBB->splice(TailMBB->end(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
MBB->addSuccessor(TestMBB);
// Delete the pseudo instruction.
MI.eraseFromParent();
++NumDynamicAllocaProbed;
return TailMBB;
}
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.is64BitELFABI() &&
MI.getOpcode() == TargetOpcode::PATCHPOINT &&
!Subtarget.isUsingPCRelativeCalls()) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't, however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
// will confuse it with a regular operand. Instead, add the dependence
// here.
MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
}
return emitPatchPoint(MI, BB);
}
if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
return emitEHSjLjSetJmp(MI, BB);
} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
return emitEHSjLjLongJmp(MI, BB);
}
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// To "insert" these instructions we actually have to insert their
// control-flow patterns.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineFunction *F = BB->getParent();
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 ||
MI.getOpcode() == PPC::SELECT_I8) {
SmallVector<MachineOperand, 2> Cond;
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8)
Cond.push_back(MI.getOperand(4));
else
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
Cond.push_back(MI.getOperand(1));
DebugLoc dl = MI.getDebugLoc();
TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
} else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
MI.getOpcode() == PPC::SELECT_CC_F16 ||
MI.getOpcode() == PPC::SELECT_CC_QFRC ||
MI.getOpcode() == PPC::SELECT_CC_QSRC ||
MI.getOpcode() == PPC::SELECT_CC_QBRC ||
MI.getOpcode() == PPC::SELECT_CC_VRRC ||
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
MI.getOpcode() == PPC::SELECT_CC_VSRC ||
MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
MI.getOpcode() == PPC::SELECT_CC_SPE ||
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
// thisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
BuildMI(BB, dl, TII->get(PPC::BC))
.addReg(MI.getOperand(1).getReg())
.addMBB(sinkMBB);
} else {
unsigned SelectPred = MI.getOperand(4).getImm();
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(SelectPred)
.addReg(MI.getOperand(1).getReg())
.addMBB(sinkMBB);
}
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
BB = copy0MBB;
// Update machine-CFG edges
BB->addSuccessor(sinkMBB);
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
.addReg(MI.getOperand(3).getReg())
.addMBB(copy0MBB)
.addReg(MI.getOperand(2).getReg())
.addMBB(thisMBB);
} else if (MI.getOpcode() == PPC::ReadTB) {
// To read the 64-bit time-base register on a 32-bit target, we read the
// two halves. Should the counter have wrapped while it was being read, we
// need to try again.
// ...
// readLoop:
// mfspr Rx,TBU # load from TBU
// mfspr Ry,TB # load from TB
// mfspr Rz,TBU # load from TBU
// cmpw crX,Rx,Rz # check if 'old'='new'
// bne readLoop # branch if they're not equal
// ...
MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, readMBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(readMBB);
BB = readMBB;
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
.addReg(HiReg)
.addReg(ReadAgainReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(CmpReg)
.addMBB(readMBB);
BB->addSuccessor(readMBB);
BB->addSuccessor(sinkMBB);
} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Compare and swap of unknown size");
case PPC::ATOMIC_CMP_SWAP_I8:
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
break;
case PPC::ATOMIC_CMP_SWAP_I16:
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
break;
case PPC::ATOMIC_CMP_SWAP_I32:
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
break;
case PPC::ATOMIC_CMP_SWAP_I64:
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
break;
}
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loop1MBB);
// loop1MBB:
// l[bhwd]arx dest, ptr
// cmp[wd] dest, oldval
// bne- midMBB
// loop2MBB:
// st[bhwd]cx. newval, ptr
// bne- loopMBB
// b exitBB
// midMBB:
// st[bhwd]cx. dest, ptr
// exitBB:
BB = loop1MBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
.addReg(oldval)
.addReg(dest);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(midMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(midMBB);
BB = loop2MBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(newval)
.addReg(ptrA)
.addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB->addSuccessor(loop1MBB);
BB->addSuccessor(exitMBB);
BB = midMBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(dest)
.addReg(ptrA)
.addReg(ptrB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
// We must use 64-bit registers for addresses when targeting 64-bit,
// since we're actually doing arithmetic on them. Other registers
// can be 32-bit.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg = RegInfo.createVirtualRegister(GPRC);
Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loop1MBB);
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw newval2, newval, shift
// slw oldval2, oldval, shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// and newval3, newval2, mask
// and oldval3, oldval2, mask
// loop1MBB:
// lwarx tmpDest, ptr
// and tmp, tmpDest, mask
// cmpw tmp, oldval3
// bne- midMBB
// loop2MBB:
// andc tmp2, tmpDest, mask
// or tmp4, tmp2, newval3
// stwcx. tmp4, ptr
// bne- loop1MBB
// b exitBB
// midMBB:
// stwcx. tmpDest, ptr
// exitBB:
// srw dest, tmpDest, shift
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
.addReg(ptrA)
.addReg(ptrB);
} else {
Ptr1Reg = ptrB;
}
// We need to use a 32-bit subregister here to avoid a register class mismatch
// in 64-bit mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(3)
.addImm(27)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addReg(Shift1Reg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(61);
else
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(0)
.addImm(29);
BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
.addReg(newval)
.addReg(ShiftReg);
BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
.addReg(oldval)
.addReg(ShiftReg);
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
.addReg(Mask3Reg)
.addImm(65535);
}
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
.addReg(Mask2Reg)
.addReg(ShiftReg);
BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
.addReg(NewVal2Reg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
.addReg(OldVal2Reg)
.addReg(MaskReg);
BB = loop1MBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
.addReg(TmpReg)
.addReg(OldVal3Reg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(midMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(midMBB);
BB = loop2MBB;
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
.addReg(Tmp2Reg)
.addReg(NewVal3Reg);
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(Tmp4Reg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB->addSuccessor(loop1MBB);
BB->addSuccessor(exitMBB);
BB = midMBB;
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
.addReg(TmpReg)
.addReg(ShiftReg);
} else if (MI.getOpcode() == PPC::FADDrtz) {
// This pseudo performs an FADD with rounding mode temporarily forced
// to round-to-zero. We emit this via custom inserter since the FPSCR
// is not modeled at the SelectionDAG level.
Register Dest = MI.getOperand(0).getReg();
Register Src1 = MI.getOperand(1).getReg();
Register Src2 = MI.getOperand(2).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
// Set rounding mode to round-to-zero.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
// Perform addition.
BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
} else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
? PPC::ANDI8_rec
: PPC::ANDI_rec;
bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register Dest = RegInfo.createVirtualRegister(
Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
DebugLoc Dl = MI.getDebugLoc();
BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
.addReg(MI.getOperand(1).getReg())
.addImm(1);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
} else if (MI.getOpcode() == PPC::TCHECK_RET) {
DebugLoc Dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(CRReg);
} else if (MI.getOpcode() == PPC::TBEGIN_RET) {
DebugLoc Dl = MI.getDebugLoc();
unsigned Imm = MI.getOperand(1).getImm();
BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(PPC::CR0EQ);
} else if (MI.getOpcode() == PPC::SETRNDi) {
DebugLoc dl = MI.getDebugLoc();
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// The floating point rounding mode is in bits 62:63 of the FPSCR, and has
// the following settings:
// 00 Round to nearest
// 01 Round to 0
// 10 Round to +inf
// 11 Round to -inf
// When the operand is an immediate, use its two least significant bits to
// set bits 62:63 of the FPSCR.
unsigned Mode = MI.getOperand(1).getImm();
BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
.addImm(31);
BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
.addImm(30);
} else if (MI.getOpcode() == PPC::SETRND) {
DebugLoc dl = MI.getDebugLoc();
// Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
// or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
// If the target doesn't have DirectMove, we should use the stack to do the
// conversion, because it lacks instructions like mtvsrd and mfvsrd that
// could do this conversion directly.
auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
if (Subtarget.hasDirectMove()) {
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
.addReg(SrcReg);
} else {
// Use stack to do the register copy.
unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
if (RC == &PPC::F8RCRegClass) {
// Copy register from F8RCRegClass to G8RCRegClass.
assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
"Unsupported RegClass.");
StoreOp = PPC::STFD;
LoadOp = PPC::LD;
} else {
// Copy register from G8RCRegClass to F8RCRegClass.
assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
(RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
"Unsupported RegClass.");
}
MachineFrameInfo &MFI = F->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
MachineMemOperand *MMOStore = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlign(FrameIdx));
// Store the SrcReg into the stack.
BuildMI(*BB, MI, dl, TII->get(StoreOp))
.addReg(SrcReg)
.addImm(0)
.addFrameIndex(FrameIdx)
.addMemOperand(MMOStore);
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlign(FrameIdx));
// Load from the stack where SrcReg is stored, and save to DestReg,
// so we have done the RegClass conversion from RegClass::SrcReg to
// RegClass::DestReg.
BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
.addImm(0)
.addFrameIndex(FrameIdx)
.addMemOperand(MMOLoad);
}
};
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// When the operand is a GPRC register, use its two least significant bits and
// the mtfsf instruction to set bits 62:63 of the FPSCR.
//
// copy OldFPSCRTmpReg, OldFPSCRReg
// (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
// rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
// copy NewFPSCRReg, NewFPSCRTmpReg
// mtfsf 255, NewFPSCRReg
MachineOperand SrcOp = MI.getOperand(1);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
// The first operand of INSERT_SUBREG should be a register that has
// subregisters; since we only care about its RegClass, we use an
// IMPLICIT_DEF register.
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
.addReg(ImDefReg)
.add(SrcOp)
.addImm(1);
Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
.addReg(OldFPSCRTmpReg)
.addReg(ExtSrcReg)
.addImm(0)
.addImm(62);
Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
// The mask 255 means that bits 32:63 of NewFPSCRReg are written into bits
// 32:63 of the FPSCR.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
.addImm(255)
.addReg(NewFPSCRReg)
.addImm(0)
.addImm(0);
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
return emitProbedAlloca(MI, BB);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//
static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
// For the estimates, convergence is quadratic, so we essentially double the
// number of digits correct after every iteration. For both FRE and FRSQRTE,
// the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
// this is 2^-14. IEEE float has 23 digits and double has 52 digits.
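// For example, an f64 estimate needs 1 + 1 = 2 steps with hasRecipPrec() and
// 3 + 1 = 4 steps without it.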
int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
RefinementSteps++;
return RefinementSteps;
}
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
// The Newton-Raphson computation with a single constant does not provide
// enough accuracy on some CPUs.
UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
(VT == MVT::f64 && Subtarget.hasFRE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
// Note: This functionality is used only when unsafe-fp-math is enabled, and
// on cores with reciprocal estimates (which are used when unsafe-fp-math is
// enabled for division), this functionality is redundant with the default
// combiner logic (once the division -> reciprocal/multiply transformation
// has taken place). As a result, this matters more for older cores than for
// newer ones.
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are two or more FDIVs (for embedded cores with only
// one FP pipeline) or three or more FDIVs (for generic OOO cores).
switch (Subtarget.getCPUDirective()) {
default:
return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
return 2;
}
}
// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
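// For example, (add (add X, 8), 16) yields Base = X and Offset += 24.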
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
int64_t& Offset, SelectionDAG &DAG) {
if (DAG.isBaseWithConstantOffset(Loc)) {
Base = Loc.getOperand(0);
Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
// The base might itself be a base plus an offset, and if so, accumulate
// that as well.
getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
}
}
static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
if (VT.getSizeInBits() / 8 != Bytes)
return false;
SDValue BaseLoc = Base->getBasePtr();
if (Loc.getOpcode() == ISD::FrameIndex) {
if (BaseLoc.getOpcode() != ISD::FrameIndex)
return false;
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
int FS = MFI.getObjectSize(FI);
int BFS = MFI.getObjectSize(BFI);
if (FS != BFS || FS != (int)Bytes) return false;
return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
}
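// Otherwise, decompose both addresses into base + constant offset and check
// that Loc lies exactly Dist * Bytes past BaseLoc.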
SDValue Base1 = Loc, Base2 = BaseLoc;
int64_t Offset1 = 0, Offset2 = 0;
getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
return true;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const GlobalValue *GV1 = nullptr;
const GlobalValue *GV2 = nullptr;
Offset1 = 0;
Offset2 = 0;
bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
if (isGA1 && isGA2 && GV1 == GV2)
return Offset1 == (Offset2 + Dist*Bytes);
return false;
}
// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
EVT VT = LS->getMemoryVT();
SDValue Loc = LS->getBasePtr();
return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
}
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
break;
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvw4x_be:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvd2x_be:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
}
return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
}
if (N->getOpcode() == ISD::INTRINSIC_VOID) {
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
break;
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_qpx_qvstfiwa:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_vsx_stxvw4x_be:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_stxvd2x_be:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
}
return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
}
return false;
}
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
SDValue Chain = LD->getChain();
EVT VT = LD->getMemoryVT();
SmallSet<SDNode *, 16> LoadRoots;
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
SmallSet<SDNode *, 16> Visited;
// First, search up the chain, branching to follow all token-factor operands.
// If we find a consecutive load, then we're done, otherwise, record all
// nodes just above the top-level loads and token factors.
while (!Queue.empty()) {
SDNode *ChainNext = Queue.pop_back_val();
if (!Visited.insert(ChainNext).second)
continue;
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
if (!Visited.count(ChainLD->getChain().getNode()))
Queue.push_back(ChainLD->getChain().getNode());
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
for (const SDUse &O : ChainNext->ops())
if (!Visited.count(O.getNode()))
Queue.push_back(O.getNode());
} else
LoadRoots.insert(ChainNext);
}
// Second, search down the chain, starting from the top-level nodes recorded
// in the first phase. These top-level nodes are the nodes just above all
// loads and token factors. Starting with their uses, recursively look through
// all loads (just the chain uses) and token factors to find a consecutive
// load.
Visited.clear();
Queue.clear();
for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
IE = LoadRoots.end(); I != IE; ++I) {
Queue.push_back(*I);
while (!Queue.empty()) {
SDNode *LoadRoot = Queue.pop_back_val();
if (!Visited.insert(LoadRoot).second)
continue;
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
for (SDNode::use_iterator UI = LoadRoot->use_begin(),
UE = LoadRoot->use_end(); UI != UE; ++UI)
if (((isa<MemSDNode>(*UI) &&
cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
Queue.push_back(*UI);
}
}
return false;
}
/// This function is called when we have proved that a SETCC node can be replaced
/// by subtraction (and other supporting instructions) so that the result of
/// the comparison is kept in a GPR instead of a CR. This function is purely for
/// codegen purposes and has some flags to guide the codegen process.
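/// The generated sequence zero-extends both operands into the widest legal
/// integer, subtracts them, and shifts the sign bit of the difference down to
/// bit 0, so the low bit holds the unsigned less-than result; the Complement
/// and Swap flags adapt this to the other unsigned condition codes.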
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
// Zero extend the operands to the largest legal integer. Originally, they
// must be of a strictly smaller size.
auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
DAG.getConstant(Size, DL, MVT::i32));
auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
DAG.getConstant(Size, DL, MVT::i32));
// Swap if needed. Depends on the condition code.
if (Swap)
std::swap(Op0, Op1);
// Subtract extended integers.
auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
// Move the sign bit to the least significant position and zero out the rest.
// Now the least significant bit carries the result of original comparison.
auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
DAG.getConstant(Size - 1, DL, MVT::i32));
auto Final = Shifted;
// Complement the result if needed. Based on the condition code.
if (Complement)
Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
DAG.getConstant(1, DL, MVT::i64));
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
}
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
// Size of integers being compared has a critical role in the following
// analysis, so we prefer to do this when all types are legal.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// If all users of SETCC extend its value to a legal integer type
// then we replace SETCC with a subtraction.
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
}
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
auto OpSize = N->getOperand(0).getValueSizeInBits();
unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
if (OpSize < Size) {
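// Each unsigned comparison is reduced to the basic x <u y test:
// ULT = x <u y, ULE = !(y <u x), UGT = y <u x, UGE = !(x <u y);
// the Complement/Swap arguments below encode exactly that.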
switch (CC) {
default: break;
case ISD::SETULT:
return generateEquivalentSub(N, Size, false, false, DL, DAG);
case ISD::SETULE:
return generateEquivalentSub(N, Size, true, true, DL, DAG);
case ISD::SETUGT:
return generateEquivalentSub(N, Size, false, true, DL, DAG);
case ISD::SETUGE:
return generateEquivalentSub(N, Size, true, false, DL, DAG);
}
}
return SDValue();
}
SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
// If we're tracking CR bits, we need to be careful that we don't have:
// trunc(binary-ops(zext(x), zext(y)))
// or
// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
// such that we're unnecessarily moving things into GPRs when it would be
// better to keep them in CR bits.
// Note that trunc here can be an actual i1 trunc, or can be the effective
// truncation that comes from a setcc or select_cc.
if (N->getOpcode() == ISD::TRUNCATE &&
N->getValueType(0) != MVT::i1)
return SDValue();
if (N->getOperand(0).getValueType() != MVT::i32 &&
N->getOperand(0).getValueType() != MVT::i64)
return SDValue();
if (N->getOpcode() == ISD::SETCC ||
N->getOpcode() == ISD::SELECT_CC) {
// If we're looking at a comparison, then we need to make sure that the
// high bits (all except for the first) don't affect the result.
ISD::CondCode CC =
cast<CondCodeSDNode>(N->getOperand(
N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
unsigned OpBits = N->getOperand(0).getValueSizeInBits();
if (ISD::isSignedIntSetCC(CC)) {
if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
return SDValue();
} else if (ISD::isUnsignedIntSetCC(CC)) {
if (!DAG.MaskedValueIsZero(N->getOperand(0),
APInt::getHighBitsSet(OpBits, OpBits-1)) ||
!DAG.MaskedValueIsZero(N->getOperand(1),
APInt::getHighBitsSet(OpBits, OpBits-1)))
return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
: SDValue());
} else {
// This is neither a signed nor an unsigned comparison, just make sure
// that the high bits are equal.
KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));
// We don't really care about what is known about the first bit (if
// anything), so clear it in all masks prior to comparing them.
Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);
if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
return SDValue();
}
}
// We now know that the higher-order bits are irrelevant; we just need to
// make sure that all of the intermediate operations are bit operations, and
// all inputs are extensions.
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
N->getOperand(1).getOpcode() != ISD::AND &&
N->getOperand(1).getOpcode() != ISD::OR &&
N->getOperand(1).getOpcode() != ISD::XOR &&
N->getOperand(1).getOpcode() != ISD::SELECT &&
N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps, PromOps;
SmallPtrSet<SDNode *, 16> Visited;
for (unsigned i = 0; i < 2; ++i) {
if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(N->getOperand(i)))
Inputs.push_back(N->getOperand(i));
else
BinOps.push_back(N->getOperand(i));
if (N->getOpcode() == ISD::TRUNCATE)
break;
}
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by extensions.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
continue;
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
continue;
if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
BinOps.push_back(BinOp.getOperand(i));
} else {
// We have an input that is not an extension or another binary
// operation; we'll abort this transformation.
return SDValue();
}
}
}
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) or SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i] ||
User->getOperand(1) == Inputs[i])
return SDValue();
}
}
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) or SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i] ||
User->getOperand(1) == PromOps[i])
return SDValue();
}
}
}
// Replace all inputs with the extension operand.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constants may have users outside the cluster of to-be-promoted nodes,
// and so we need to replace those as we do the promotions.
if (isa<ConstantSDNode>(Inputs[i]))
continue;
else
DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
}
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
PromOpHandles.emplace_back(PromOp);
// Replace all operations (these are all the same, but have a different
// (i1) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first. Any intermediate truncations or
// extensions disappear.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
PromOpHandles.pop_back();
if (PromOp.getOpcode() == ISD::TRUNCATE ||
PromOp.getOpcode() == ISD::SIGN_EXTEND ||
PromOp.getOpcode() == ISD::ZERO_EXTEND ||
PromOp.getOpcode() == ISD::ANY_EXTEND) {
if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
PromOp.getOperand(0).getValueType() != MVT::i1) {
// The operand is not yet ready (see comment below).
PromOpHandles.emplace_front(PromOp);
continue;
}
SDValue RepValue = PromOp.getOperand(0);
if (isa<ConstantSDNode>(RepValue))
RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
continue;
}
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
}
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != MVT::i1) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
PromOpHandles.emplace_front(PromOp);
continue;
}
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
PromOp.getNode()->op_end());
// If there are any constant inputs, make sure they're replaced now.
for (unsigned i = 0; i < 2; ++i)
if (isa<ConstantSDNode>(Ops[C+i]))
Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
DAG.ReplaceAllUsesOfValueWith(PromOp,
DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
}
// Now we're left with the initial truncation itself.
if (N->getOpcode() == ISD::TRUNCATE)
return N->getOperand(0);
// Otherwise, this is a comparison. The operands to be compared have just
// changed type (to i1), but everything else is the same.
return SDValue(N, 0);
}
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
// If we're tracking CR bits, we need to be careful that we don't have:
// zext(binary-ops(trunc(x), trunc(y)))
// or
// zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
// such that we're unnecessarily moving things into CR bits that can more
// efficiently stay in GPRs. Note that if we're not certain that the high
// bits are set as required by the final extension, we still may need to do
// some masking to get the proper behavior.
// This same functionality is important on PPC64 when dealing with
// 32-to-64-bit extensions; these occur often when 32-bit values are used as
// the return values of functions. Because it is so similar, it is handled
// here as well.
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
return SDValue();
if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
(N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
return SDValue();
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC)
return SDValue();
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
SmallPtrSet<SDNode *, 16> Visited;
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by truncations.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
continue;
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
continue;
if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
BinOps.push_back(BinOp.getOperand(i));
} else {
// We have an input that is not a truncation or another binary
// operation; we'll abort this transformation.
return SDValue();
}
}
}
// The operands of a select that must be truncated when the select is
// promoted because the operand is actually part of the to-be-promoted set.
DenseMap<SDNode *, EVT> SelectTruncOp[2];
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) or SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
if (User->getOperand(1) == Inputs[i])
SelectTruncOp[1].insert(std::make_pair(User,
User->getOperand(1).getValueType()));
}
}
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) or SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
if (User->getOperand(1) == PromOps[i])
SelectTruncOp[1].insert(std::make_pair(User,
User->getOperand(1).getValueType()));
}
}
}
unsigned PromBits = N->getOperand(0).getValueSizeInBits();
bool ReallyNeedsExt = false;
if (N->getOpcode() != ISD::ANY_EXTEND) {
// If not all of the inputs are already sign/zero extended, then
// we'll still need to do that at the end.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
unsigned OpBits =
Inputs[i].getOperand(0).getValueSizeInBits();
assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
if ((N->getOpcode() == ISD::ZERO_EXTEND &&
!DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
APInt::getHighBitsSet(OpBits,
OpBits-PromBits))) ||
(N->getOpcode() == ISD::SIGN_EXTEND &&
DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
(OpBits-(PromBits-1)))) {
ReallyNeedsExt = true;
break;
}
}
}
// Replace all inputs, either with the truncation operand, or a
// truncation or extension to the final output type.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constant inputs need to be replaced with the to-be-promoted nodes that
// use them because they might have users outside of the cluster of
// promoted nodes.
if (isa<ConstantSDNode>(Inputs[i]))
continue;
SDValue InSrc = Inputs[i].getOperand(0);
if (Inputs[i].getValueType() == N->getValueType(0))
DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
else if (N->getOpcode() == ISD::SIGN_EXTEND)
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
else
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
}
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
PromOpHandles.emplace_back(PromOp);
// Replace all operations (these are all the same, but have a different
// (promoted) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
PromOpHandles.pop_back();
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
}
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
PromOpHandles.emplace_front(PromOp);
continue;
}
// For SELECT and SELECT_CC nodes, we do a similar check for any
// to-be-promoted comparison inputs.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
if ((SelectTruncOp[0].count(PromOp.getNode()) &&
PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
(SelectTruncOp[1].count(PromOp.getNode()) &&
PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
PromOpHandles.emplace_front(PromOp);
continue;
}
}
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
PromOp.getNode()->op_end());
// If this node has constant inputs, then they'll need to be promoted here.
for (unsigned i = 0; i < 2; ++i) {
if (!isa<ConstantSDNode>(Ops[C+i]))
continue;
if (Ops[C+i].getValueType() == N->getValueType(0))
continue;
if (N->getOpcode() == ISD::SIGN_EXTEND)
Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
else
Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
}
// If we've promoted the comparison inputs of a SELECT or SELECT_CC,
// truncate them again to the original value type.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
if (SI0 != SelectTruncOp[0].end())
Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
if (SI1 != SelectTruncOp[1].end())
Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
}
DAG.ReplaceAllUsesOfValueWith(PromOp,
DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
}
// Now we're left with the initial extension itself.
if (!ReallyNeedsExt)
return N->getOperand(0);
// To zero extend, just mask off everything except for the first bit (in the
// i1 case).
if (N->getOpcode() == ISD::ZERO_EXTEND)
return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
DAG.getConstant(APInt::getLowBitsSet(
N->getValueSizeInBits(0), PromBits),
dl, N->getValueType(0)));
assert(N->getOpcode() == ISD::SIGN_EXTEND &&
"Invalid extension type");
EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
return DAG.getNode(
ISD::SRA, dl, N->getValueType(0),
DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
ShiftCst);
}
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC &&
"Should be called with a SETCC node");
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
LHS.hasOneUse())
std::swap(LHS, RHS);
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDLoc DL(N);
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
}
return DAGCombineTruncBoolExt(N, DCI);
}
// Is this an extending load from an f32 to an f64?
static bool isFPExtLoad(SDValue Op) {
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
return LD->getExtensionType() == ISD::EXTLOAD &&
Op.getValueType() == MVT::f64;
return false;
}
/// Reduces the number of fp-to-int conversions when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue FirstInput = N->getOperand(0);
assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
"The input operand must be an fp-to-int conversion.");
// This combine happens after legalization so the fp_to_[su]i nodes are
// already converted to PPCISD nodes.
unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIDUZ ||
FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ) {
bool IsSplat = true;
bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ;
EVT SrcVT = FirstInput.getOperand(0).getValueType();
SmallVector<SDValue, 4> Ops;
EVT TargetVT = N->getValueType(0);
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue NextOp = N->getOperand(i);
if (NextOp.getOpcode() != PPCISD::MFVSR)
return SDValue();
unsigned NextConversion = NextOp.getOperand(0).getOpcode();
if (NextConversion != FirstConversion)
return SDValue();
// If we are converting to 32-bit integers, we need to add an FP_ROUND.
// This is not valid if the input was originally double precision. It is
// also not profitable to do unless this is an extending load, in which
// case doing this combine will allow us to combine consecutive loads.
if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
return SDValue();
if (N->getOperand(i) != FirstInput)
IsSplat = false;
}
// If this is a splat, we leave it as-is since there will be only a single
// fp-to-int conversion followed by a splat of the integer. This is better
// for 32-bit and smaller ints and neutral for 64-bit ints.
if (IsSplat)
return SDValue();
// Now that we know we have the right type of node, get its operands
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue In = N->getOperand(i).getOperand(0);
if (Is32Bit) {
// For 32-bit values, we need to add an FP_ROUND node (if we made it
// here, we know that all inputs are extending loads so this is safe).
if (In.isUndef())
Ops.push_back(DAG.getUNDEF(SrcVT));
else {
SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, In.getOperand(0),
DAG.getIntPtrConstant(1, dl));
Ops.push_back(Trunc);
}
} else
Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
}
unsigned Opcode;
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIWZ)
Opcode = ISD::FP_TO_SINT;
else
Opcode = ISD::FP_TO_UINT;
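// Pick the FP vector type with the same element count as the integer target.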
EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
return DAG.getNode(Opcode, dl, TargetVT, BV);
}
return SDValue();
}
/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
/// consecutive but in descending order, a shuffle is added at the end
/// to reorder the vector.
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SDLoc dl(N);
// Return early for non byte-sized types, as they can't be consecutive.
if (!N->getValueType(0).getVectorElementType().isByteSized())
return SDValue();
bool InputsAreConsecutiveLoads = true;
bool InputsAreReverseConsecutive = true;
unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
SDValue FirstInput = N->getOperand(0);
bool IsRoundOfExtLoad = false;
if (FirstInput.getOpcode() == ISD::FP_ROUND &&
FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
}
// Not a build vector of (possibly fp_rounded) loads.
if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
N->getNumOperands() == 1)
return SDValue();
for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
return SDValue();
SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
N->getOperand(i);
if (NextInput.getOpcode() != ISD::LOAD)
return SDValue();
SDValue PreviousInput =
IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
return SDValue();
if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
InputsAreConsecutiveLoads = false;
if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
InputsAreReverseConsecutive = false;
// Exit early if the loads are neither consecutive nor reverse consecutive.
if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
return SDValue();
}
assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
"The loads cannot be both consecutive and reverse consecutive.");
SDValue FirstLoadOp =
IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
SDValue LastLoadOp =
IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
N->getOperand(N->getNumOperands()-1);
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
if (InputsAreConsecutiveLoads) {
assert(LD1 && "Input needs to be a LoadSDNode.");
return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
LD1->getBasePtr(), LD1->getPointerInfo(),
LD1->getAlignment());
}
if (InputsAreReverseConsecutive) {
assert(LDL && "Input needs to be a LoadSDNode.");
SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
LDL->getBasePtr(), LDL->getPointerInfo(),
LDL->getAlignment());
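// Build a reversed identity mask (N-1, ..., 1, 0) to undo the descending
// element order of the loads.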
SmallVector<int, 16> Ops;
for (int i = N->getNumOperands() - 1; i >= 0; i--)
Ops.push_back(i);
return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
DAG.getUNDEF(N->getValueType(0)), Ops);
}
return SDValue();
}
// This function adds the required vector_shuffle needed to get
// the elements of the vector extract in the correct position
// as specified by the CorrectElems encoding.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
SDValue Input, uint64_t Elems,
uint64_t CorrectElems) {
SDLoc dl(N);
unsigned NumElems = Input.getValueType().getVectorNumElements();
SmallVector<int, 16> ShuffleMask(NumElems, -1);
// Knowing the element indices being extracted from the original
// vector and the order in which they're being inserted, just put
// them at element indices required for the instruction.
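// Each byte of Elems/CorrectElems holds one element index, with LE indices in
// the low nibble and BE indices in the high nibble (see the TargetElems table
// in combineBVOfVecSExt below).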
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (DAG.getDataLayout().isLittleEndian())
ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
else
ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
CorrectElems = CorrectElems >> 8;
Elems = Elems >> 8;
}
SDValue Shuffle =
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
EVT VT = N->getValueType(0);
SDValue Conv = DAG.getBitcast(VT, Shuffle);
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
Input.getValueType().getVectorElementType(),
VT.getVectorNumElements());
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
DAG.getValueType(ExtVT));
}
// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
// aren't used, add a vector shuffle to fix up the indices and create
// a SIGN_EXTEND_INREG node, which selects the vector sign extend instructions
// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
// This array encodes the indices that the vector sign extend instructions
// extract from when extending from one type to another for both BE and LE.
// The right nibble of each byte corresponds to the LE indices,
// and the left nibble of each byte corresponds to the BE indices.
// For example: 0x3074B8FC byte->word
// For LE: the allowed indices are: 0x0,0x4,0x8,0xC
// For BE: the allowed indices are: 0x3,0x7,0xB,0xF
// For example: 0x000070F8 byte->double word
// For LE: the allowed indices are: 0x0,0x8
// For BE: the allowed indices are: 0x7,0xF
uint64_t TargetElems[] = {
0x3074B8FC, // b->w
0x000070F8, // b->d
0x10325476, // h->w
0x00003074, // h->d
0x00001032, // w->d
};
uint64_t Elems = 0;
int Index;
SDValue Input;
auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
if (!Op)
return false;
if (Op.getOpcode() != ISD::SIGN_EXTEND &&
Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
return false;
// A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
// of the right width.
SDValue Extract = Op.getOperand(0);
if (Extract.getOpcode() == ISD::ANY_EXTEND)
Extract = Extract.getOperand(0);
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
if (!ExtOp)
return false;
Index = ExtOp->getZExtValue();
if (Input && Input != Extract.getOperand(0))
return false;
if (!Input)
Input = Extract.getOperand(0);
Elems = Elems << 8;
Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
Elems |= Index;
return true;
};
// If the build vector operands aren't sign-extended vector extracts
// of the same input vector, then return.
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (!isSExtOfVecExtract(N->getOperand(i))) {
return SDValue();
}
}
// If the vector extract indices are not correct, add the appropriate
// vector_shuffle.
int TgtElemArrayIdx;
int InputSize = Input.getValueType().getScalarSizeInBits();
int OutputSize = N->getValueType(0).getScalarSizeInBits();
if (InputSize + OutputSize == 40)
TgtElemArrayIdx = 0;
else if (InputSize + OutputSize == 72)
TgtElemArrayIdx = 1;
else if (InputSize + OutputSize == 48)
TgtElemArrayIdx = 2;
else if (InputSize + OutputSize == 80)
TgtElemArrayIdx = 3;
else if (InputSize + OutputSize == 96)
TgtElemArrayIdx = 4;
else
return SDValue();
uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
CorrectElems = DAG.getDataLayout().isLittleEndian()
? CorrectElems & 0x0F0F0F0F0F0F0F0F
: CorrectElems & 0xF0F0F0F0F0F0F0F0;
if (Elems != CorrectElems) {
return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
}
// Regular lowering will catch cases where a shuffle is not needed.
return SDValue();
}
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (!Subtarget.hasVSX())
return SDValue();
// The target independent DAG combiner will leave a build_vector of
// float-to-int conversions intact. We can generate MUCH better code for
// a float-to-int conversion of a vector of floats.
SDValue FirstInput = N->getOperand(0);
if (FirstInput.getOpcode() == PPCISD::MFVSR) {
SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
if (Reduced)
return Reduced;
}
// If we're building a vector out of consecutive loads, just load that
// vector type.
SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
if (Reduced)
return Reduced;
// If we're building a vector out of extended elements from another vector
// we have P9 vector integer extend instructions. The code assumes legal
// input types (i.e. it can't handle things like v4i16) so do not run before
// legalization.
if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
Reduced = combineBVOfVecSExt(N, DAG);
if (Reduced)
return Reduced;
}
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
// Looking for:
// (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
FirstInput.getOpcode() != ISD::UINT_TO_FP)
return SDValue();
if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
return SDValue();
if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
return SDValue();
SDValue Ext1 = FirstInput.getOperand(0);
SDValue Ext2 = N->getOperand(1).getOperand(0);
if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
if (!Ext1Op || !Ext2Op)
return SDValue();
if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
Ext1.getOperand(0) != Ext2.getOperand(0))
return SDValue();
int FirstElem = Ext1Op->getZExtValue();
int SecondElem = Ext2Op->getZExtValue();
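// Elements {0,1} of the v4i32 source form doubleword 0 on big-endian targets
// and doubleword 1 on little-endian targets (and vice versa for {2,3}).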
int SubvecIdx;
if (FirstElem == 0 && SecondElem == 1)
SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
else if (FirstElem == 2 && SecondElem == 3)
SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
else
return SDValue();
SDValue SrcVec = Ext1.getOperand(0);
auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
return DAG.getNode(NodeType, dl, MVT::v2f64,
SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
}
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::UINT_TO_FP) &&
"Need an int -> FP conversion node here");
if (useSoftFloat() || !Subtarget.has64BitSupport())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Op(N, 0);
// Don't handle ppc_fp128 here or conversions that are out-of-range capable
// from the hardware.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
return SDValue();
SDValue FirstOperand(Op.getOperand(0));
bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
(FirstOperand.getValueType() == MVT::i8 ||
FirstOperand.getValueType() == MVT::i16);
if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
bool DstDouble = Op.getValueType() == MVT::f64;
unsigned ConvOp = Signed ?
(DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
(DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
SDValue WidthConst =
DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
dl, false);
LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i8, LDN->getMemOperand());
// For signed conversion, we need to sign-extend the value in the VSR
if (Signed) {
SDValue ExtOps[] = { Ld, WidthConst };
SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
} else
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
}
// For i32 intermediate values, unfortunately, the conversion functions
// leave the upper 32 bits of the value undefined. Within the set of
// scalar instructions, we have no method for zero- or sign-extending the
// value. Thus, we cannot handle i32 intermediate values here.
if (Op.getOperand(0).getValueType() == MVT::i32)
return SDValue();
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: PPCISD::FCFIDS)
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
: PPCISD::FCFID);
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
// If we're converting from a float, to an int, and back to a float again,
// then we don't need the store/load pair at all.
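// For example, (f64 (sint_to_fp (i64 (fp_to_sint f64:x)))) becomes
// (f64 (fcfid (fctidz f64:x))).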
if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
Subtarget.hasFPCVT()) ||
(Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
SDValue Src = Op.getOperand(0).getOperand(0);
if (Src.getValueType() == MVT::f32) {
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
DCI.AddToWorklist(Src.getNode());
} else if (Src.getValueType() != MVT::f64) {
// Make sure that we don't pick up a ppc_fp128 source value.
return SDValue();
}
unsigned FCTOp =
Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ;
SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
DCI.AddToWorklist(FP.getNode());
}
return FP;
}
return SDValue();
}
// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
// builtins) into loads with swaps.
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
default:
llvm_unreachable("Unexpected opcode for little endian VSX load");
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
Chain = LD->getChain();
Base = LD->getBasePtr();
MMO = LD->getMemOperand();
// If the MMO suggests this isn't a load of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem that will be a bug.
if (MMO->getSize() < 16)
return SDValue();
break;
}
case ISD::INTRINSIC_W_CHAIN: {
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Similarly to the store case below, Intrin->getBasePtr() doesn't get
// us what we want. Get operand 2 instead.
Base = Intrin->getOperand(2);
MMO = Intrin->getMemOperand();
break;
}
}
MVT VecTy = N->getValueType(0).getSimpleVT();
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
// aligned and the type is a vector with elements up to 4 bytes
if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
SDValue LoadOps[] = { Chain, Base };
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
DAG.getVTList(MVT::v2f64, MVT::Other),
LoadOps, MVT::v2f64, MMO);
DCI.AddToWorklist(Load.getNode());
Chain = Load.getValue(1);
SDValue Swap = DAG.getNode(
PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
DCI.AddToWorklist(Swap.getNode());
// Add a bitcast if the resulting load type doesn't match v2f64.
if (VecTy != MVT::v2f64) {
SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
DCI.AddToWorklist(N.getNode());
// Package {bitcast value, swap's chain} to match Load's shape.
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
N, Swap.getValue(1));
}
return Swap;
}
// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
// builtins) into stores with swaps.
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
unsigned SrcOpnd;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
default:
llvm_unreachable("Unexpected opcode for little endian VSX store");
case ISD::STORE: {
StoreSDNode *ST = cast<StoreSDNode>(N);
Chain = ST->getChain();
Base = ST->getBasePtr();
MMO = ST->getMemOperand();
SrcOpnd = 1;
// If the MMO suggests this isn't a store of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem that will be a bug.
if (MMO->getSize() < 16)
return SDValue();
break;
}
case ISD::INTRINSIC_VOID: {
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Intrin->getBasePtr() oddly does not get what we want.
Base = Intrin->getOperand(3);
MMO = Intrin->getMemOperand();
SrcOpnd = 2;
break;
}
}
SDValue Src = N->getOperand(SrcOpnd);
MVT VecTy = Src.getValueType().getSimpleVT();
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
// aligned and the type is a vector with elements up to 4 bytes.
if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
// All stores are done as v2f64 with a possible bitcast.
if (VecTy != MVT::v2f64) {
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
DCI.AddToWorklist(Src.getNode());
}
SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
DCI.AddToWorklist(Swap.getNode());
Chain = Swap.getValue(1);
SDValue StoreOps[] = { Chain, Swap, Base };
SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
DAG.getVTList(MVT::Other),
StoreOps, VecTy, MMO);
DCI.AddToWorklist(Store.getNode());
return Store;
}
// Handle DAG combine for STORE (FP_TO_INT F).
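// The value is converted directly in a VSR and stored with a
// store-scalar-integer node, so it never has to pass through a GPR.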
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
unsigned Opcode = N->getOperand(1).getOpcode();
assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
&& "Not a FP_TO_INT Instruction!");
SDValue Val = N->getOperand(1).getOperand(0);
EVT Op1VT = N->getOperand(1).getValueType();
EVT ResVT = Val.getValueType();
// Floating point types smaller than 32 bits are not legal on Power.
if (ResVT.getScalarSizeInBits() < 32)
return SDValue();
// Only perform combine for conversion to i64/i32 or power9 i16/i8.
bool ValidTypeForStoreFltAsInt =
(Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() ||
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
return SDValue();
// Extend f32 values to f64
if (ResVT.getScalarSizeInBits() == 32) {
Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
DCI.AddToWorklist(Val.getNode());
}
// Set signed or unsigned conversion opcode.
unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
PPCISD::FP_TO_SINT_IN_VSR :
PPCISD::FP_TO_UINT_IN_VSR;
Val = DAG.getNode(ConvOpcode,
dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
DCI.AddToWorklist(Val.getNode());
// Set number of bytes being converted.
unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
DAG.getIntPtrConstant(ByteSize, dl, false),
DAG.getValueType(Op1VT) };
Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
DAG.getVTList(MVT::Other), Ops,
cast<StoreSDNode>(N)->getMemoryVT(),
cast<StoreSDNode>(N)->getMemOperand());
DCI.AddToWorklist(Val.getNode());
return Val;
}
static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
// Check that the source of the element keeps flipping
// (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
bool PrevElemFromFirstVec = Mask[0] < NumElts;
for (int i = 1, e = Mask.size(); i < e; i++) {
if (PrevElemFromFirstVec && Mask[i] < NumElts)
return false;
if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
return false;
PrevElemFromFirstVec = !PrevElemFromFirstVec;
}
return true;
}
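// Check whether this BUILD_VECTOR is a splat, i.e. all defined operands are
// the same value (undef operands are ignored).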
static bool isSplatBV(SDValue Op) {
if (Op.getOpcode() != ISD::BUILD_VECTOR)
return false;
SDValue FirstOp;
// Find first non-undef input.
for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
FirstOp = Op.getOperand(i);
if (!FirstOp.isUndef())
break;
}
// All inputs are undef or the same as the first non-undef input.
for (int i = 1, e = Op.getNumOperands(); i < e; i++)
if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
return false;
return true;
}
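// Return the SCALAR_TO_VECTOR node if this operand is one, possibly looking
// through a bitcast; otherwise return an empty SDValue.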
static SDValue isScalarToVec(SDValue Op) {
if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
return Op;
if (Op.getOpcode() != ISD::BITCAST)
return SDValue();
Op = Op.getOperand(0);
if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
return Op;
return SDValue();
}
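// Shift shuffle mask entries that refer to element zero of a permuted
// SCALAR_TO_VECTOR input by HalfVec so they point at the element's new
// (permuted) position.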
static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
int LHSMaxIdx, int RHSMinIdx,
int RHSMaxIdx, int HalfVec) {
for (int i = 0, e = ShuffV.size(); i < e; i++) {
int Idx = ShuffV[i];
if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
ShuffV[i] += HalfVec;
}
return;
}
// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
// the original is:
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
// In such a case, just change the shuffle mask to extract the element
// from the permuted index.
static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
SDLoc dl(OrigSToV);
EVT VT = OrigSToV.getValueType();
assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
"Expecting a SCALAR_TO_VECTOR here");
SDValue Input = OrigSToV.getOperand(0);
if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
SDValue OrigVector = Input.getOperand(0);
// Can't handle non-const element indices or different vector types
// for the input to the extract and the output of the scalar_to_vector.
if (Idx && VT == OrigVector.getValueType()) {
SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
}
}
return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,
OrigSToV.getOperand(0));
}
// On little endian subtargets, combine shuffles such as:
// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
// into:
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
// because the latter can be matched to a single instruction merge.
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
// to put the value into element zero. Adjust the shuffle mask so that the
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
SDValue LHS = SVN->getOperand(0);
SDValue RHS = SVN->getOperand(1);
auto Mask = SVN->getMask();
int NumElts = LHS.getValueType().getVectorNumElements();
SDValue Res(SVN, 0);
SDLoc dl(SVN);
// None of these combines are useful on big endian systems since the ISA
// already has a big endian bias.
if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
return Res;
// If this is not a shuffle of a shuffle and the first element comes from
// the second vector, canonicalize to the commuted form. This will make it
// more likely to match one of the single instruction patterns.
if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
std::swap(LHS, RHS);
Res = DAG.getCommutedVectorShuffle(*SVN);
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
}
// Adjust the shuffle mask if either input vector comes from a
// SCALAR_TO_VECTOR and keep the respective input vector in permuted
// form (to prevent the need for a swap).
SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end());
SDValue SToVLHS = isScalarToVec(LHS);
SDValue SToVRHS = isScalarToVec(RHS);
if (SToVLHS || SToVRHS) {
int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
: SToVRHS.getValueType().getVectorNumElements();
int NumEltsOut = ShuffV.size();
// Initially assume that neither input is permuted. These will be adjusted
// accordingly if either input is.
int LHSMaxIdx = -1;
int RHSMinIdx = -1;
int RHSMaxIdx = -1;
int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
// Get the permuted scalar to vector nodes for the source(s) that come from
// ISD::SCALAR_TO_VECTOR.
if (SToVLHS) {
// Set up the values for the shuffle vector fixup.
LHSMaxIdx = NumEltsOut / NumEltsIn;
SToVLHS = getSToVPermuted(SToVLHS, DAG);
if (SToVLHS.getValueType() != LHS.getValueType())
SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
LHS = SToVLHS;
}
if (SToVRHS) {
RHSMinIdx = NumEltsOut;
RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
SToVRHS = getSToVPermuted(SToVRHS, DAG);
if (SToVRHS.getValueType() != RHS.getValueType())
SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
RHS = SToVRHS;
}
// Fix up the shuffle mask to reflect where the desired element actually is.
// The minimum and maximum indices that correspond to element zero for both
// the LHS and RHS are computed and will control which shuffle mask entries
// are to be changed. For example, if the RHS is permuted, any shuffle mask
// entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
// HalfVec to refer to the corresponding element in the permuted vector.
fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
HalfVec);
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
// We may have simplified away the shuffle. We won't be able to do anything
// further with it here.
if (!isa<ShuffleVectorSDNode>(Res))
return Res;
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
}
// The common case after we commuted the shuffle is that the RHS is a splat
// and we have elements coming in from the splat at indices that are not
// conducive to using a merge.
// Example:
// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
if (!isSplatBV(RHS))
return Res;
// We are looking for a mask such that all even elements are from
// one vector and all odd elements from the other.
if (!isAlternatingShuffMask(Mask, NumElts))
return Res;
// Adjust the mask so we are pulling in the same index from the splat
// as the index from the interesting vector in consecutive elements.
// Example (even elements from first vector):
// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
if (Mask[0] < NumElts)
for (int i = 1, e = Mask.size(); i < e; i += 2)
ShuffV[i] = (ShuffV[i - 1] + NumElts);
// Example (odd elements from first vector):
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
else
for (int i = 0, e = Mask.size(); i < e; i += 2)
ShuffV[i] = (ShuffV[i + 1] + NumElts);
// If the RHS has undefs, we need to remove them since we may have created
// a shuffle that adds those instead of the splat value.
SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
return Res;
}
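// Fold an element-reversing shuffle of a normal vector load, or one feeding a
// normal vector store, into a single byte-reversed memory access
// (PPCISD::LOAD_VEC_BE / PPCISD::STORE_VEC_BE) on little-endian Power9+.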
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const {
assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
"Not a reverse memop pattern!");
auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
auto Mask = SVN->getMask();
int i = 0;
auto I = Mask.rbegin();
auto E = Mask.rend();
for (; I != E; ++I) {
if (*I != i)
return false;
i++;
}
return true;
};
SelectionDAG &DAG = DCI.DAG;
EVT VT = SVN->getValueType(0);
if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
return SDValue();
// Before P9, the PPCVSXSwapRemoval pass rearranges the element order
// (see the comment in PPCVSXSwapRemoval.cpp). This combine conflicts
// with that optimization, so we do not perform it pre-P9.
if (!Subtarget.hasP9Vector())
return SDValue();
if (!IsElementReverse(SVN))
return SDValue();
if (LSBase->getOpcode() == ISD::LOAD) {
SDLoc dl(SVN);
SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
}
if (LSBase->getOpcode() == ISD::STORE) {
SDLoc dl(LSBase);
SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
}
llvm_unreachable("Expected a load or store node here");
}
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
return combineADD(N, DCI);
case ISD::SHL:
return combineSHL(N, DCI);
case ISD::SRA:
return combineSRA(N, DCI);
case ISD::SRL:
return combineSRL(N, DCI);
case ISD::MUL:
return combineMUL(N, DCI);
case ISD::FMA:
case PPCISD::FNMSUB:
return combineFMALike(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
break;
case PPCISD::SRL:
if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
return N->getOperand(0);
break;
case PPCISD::SRA:
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
if (C->isNullValue() || // 0 >>s V -> 0.
C->isAllOnesValue()) // -1 >>s V -> -1.
return N->getOperand(0);
}
break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
return DAGCombineExtBoolTrunc(N, DCI);
case ISD::TRUNCATE:
return combineTRUNCATE(N, DCI);
case ISD::SETCC:
if (SDValue CSCC = combineSetCC(N, DCI))
return CSCC;
LLVM_FALLTHROUGH;
case ISD::SELECT_CC:
return DAGCombineTruncBoolExt(N, DCI);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return combineFPToIntToFP(N, DCI);
case ISD::VECTOR_SHUFFLE:
if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
LSBaseSDNode *LSBase = cast<LSBaseSDNode>(N->getOperand(0));
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
}
return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
unsigned Opcode = N->getOperand(1).getOpcode();
if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
SDValue Val = combineStoreFPToInt(N, DCI);
if (Val)
return Val;
}
if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
if (Val)
return Val;
}
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
(Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
// STBRX can only handle simple types and it makes no sense to store fewer
// than two bytes in byte-reversed order.
EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
if (mVT.isExtended() || mVT.getSizeInBits() < 16)
break;
SDValue BSwapOp = N->getOperand(1).getOperand(0);
// Do an any-extend to 32-bits if this is a half-word input.
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
// If the type of the BSWAP operand is wider than the stored memory width,
// it needs to be shifted right before the STBRX.
if (Op1VT.bitsGT(mVT)) {
int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
DAG.getConstant(Shift, dl, MVT::i32));
// Need to truncate if this is a bswap of i64 stored as i32/i16.
if (Op1VT == MVT::i64)
BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
}
SDValue Ops[] = {
N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
};
return
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
Ops, cast<StoreSDNode>(N)->getMemoryVT(),
cast<StoreSDNode>(N)->getMemOperand());
}
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
// This increases the chance of CSE for the constant construction.
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
// Need to sign-extend to 64 bits to handle negative values.
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
MemVT.getSizeInBits());
SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
// DAG.getTruncStore() can't be used here because it doesn't accept
// the general (base + offset) addressing mode.
// So we use UpdateNodeOperands and setTruncatingStore instead.
DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
N->getOperand(3));
cast<StoreSDNode>(N)->setTruncatingStore(true);
return SDValue(N, 0);
}
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Op1VT.isSimple()) {
MVT StoreVT = Op1VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
return expandVSXStoreForLE(N, DCI);
}
break;
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT VT = LD->getValueType(0);
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (VT.isSimple()) {
MVT LoadVT = VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
return expandVSXLoadForLE(N, DCI);
}
// We sometimes end up with a 64-bit integer load, from which we extract
// two single-precision floating-point numbers. This happens with
// std::complex<float>, and other similar structures, because of the way we
// canonicalize structure copies. However, if we lack direct moves,
// then the final bitcasts from the extracted integer values to the
// floating-point numbers turn into store/load pairs. Even with direct moves,
// just loading the two floating-point numbers is likely better.
auto ReplaceTwoFloatLoad = [&]() {
if (VT != MVT::i64)
return false;
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
LD->isVolatile())
return false;
// We're looking for a sequence like this:
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
// t16: i64 = srl t13, Constant:i32<32>
// t17: i32 = truncate t16
// t18: f32 = bitcast t17
// t19: i32 = truncate t13
// t20: f32 = bitcast t19
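// The two bitcasts are then replaced with direct f32 loads from BasePtr and
// BasePtr + 4, and the original i64 load's chain users are rewired to the
// new loads' chain.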
if (!LD->hasNUsesOfValue(2, 0))
return false;
auto UI = LD->use_begin();
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *Trunc = *UI++;
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *RightShift = *UI;
if (Trunc->getOpcode() != ISD::TRUNCATE)
std::swap(Trunc, RightShift);
if (Trunc->getOpcode() != ISD::TRUNCATE ||
Trunc->getValueType(0) != MVT::i32 ||
!Trunc->hasOneUse())
return false;
if (RightShift->getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
RightShift->getConstantOperandVal(1) != 32 ||
!RightShift->hasOneUse())
return false;
SDNode *Trunc2 = *RightShift->use_begin();
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
Trunc2->getValueType(0) != MVT::i32 ||
!Trunc2->hasOneUse())
return false;
SDNode *Bitcast = *Trunc->use_begin();
SDNode *Bitcast2 = *Trunc2->use_begin();
if (Bitcast->getOpcode() != ISD::BITCAST ||
Bitcast->getValueType(0) != MVT::f32)
return false;
if (Bitcast2->getOpcode() != ISD::BITCAST ||
Bitcast2->getValueType(0) != MVT::f32)
return false;
if (Subtarget.isLittleEndian())
std::swap(Bitcast, Bitcast2);
// Bitcast has the second float (in memory-layout order) and Bitcast2
// has the first one.
SDValue BasePtr = LD->getBasePtr();
if (LD->isIndexed()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
BasePtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
LD->getOffset());
}
auto MMOFlags =
LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->getAlignment(),
MMOFlags, LD->getAAInfo());
SDValue AddPtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr, DAG.getIntPtrConstant(4, dl));
SDValue FloatLoad2 = DAG.getLoad(
MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
LD->getPointerInfo().getWithOffset(4),
MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
if (LD->isIndexed()) {
// Note that DAGCombine should re-form any pre-increment load(s) from
// what is produced here if that makes sense.
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
}
DCI.CombineTo(Bitcast2, FloatLoad);
DCI.CombineTo(Bitcast, FloatLoad2);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
SDValue(FloatLoad2.getNode(), 1));
return true;
};
if (ReplaceTwoFloatLoad())
return SDValue(N, 0);
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
!Subtarget.hasP8Vector() &&
(VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v4f32)) ||
(Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
LD->getAlign() >= ScalarABIAlignment)) &&
LD->getAlign() < ABIAlignment) {
// This is a type-legal unaligned Altivec or QPX load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = Subtarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
// https://developer.apple.com/hardwaredrivers/ve/alignment.html
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
//
// The general idea is to expand a sequence of one or more unaligned
// loads into an alignment-based permutation-control instruction (lvsl
// or lvsr), a series of regular vector loads (which always truncate
// their input address to an aligned address), and a series of
// permutations. The results of these permutations are the requested
// loaded values. The trick is that the last "extra" load is not taken
// from the address you might suspect (sizeof(vector) bytes after the
// last requested load), but rather sizeof(vector) - 1 bytes after the
// last requested vector. The point of this is to avoid a page fault if
// the base address happened to be aligned. This works because if the
// base address is aligned, then adding less than a full vector length
// will cause the last vector in the sequence to be (re)loaded.
// Otherwise, the next vector will be fetched as you might suspect was
// necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
Intrinsic::ID Intr, IntrLD, IntrPerm;
MVT PermCntlTy, PermTy, LDTy;
if (Subtarget.hasAltivec()) {
Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl;
IntrLD = Intrinsic::ppc_altivec_lvx;
IntrPerm = Intrinsic::ppc_altivec_vperm;
PermCntlTy = MVT::v16i8;
PermTy = MVT::v4i32;
LDTy = MVT::v4i32;
} else {
Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
Intrinsic::ppc_qpx_qvlpcls;
IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
Intrinsic::ppc_qpx_qvlfs;
IntrPerm = Intrinsic::ppc_qpx_qvfperm;
PermCntlTy = MVT::v4f64;
PermTy = MVT::v4f64;
LDTy = MemVT.getSimpleVT();
}
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
// Create the new MMO for the new base load. It is like the original MMO,
// but represents an area in memory almost twice the vector size centered
// on the original address. If the address is unaligned, we might start
// reading up to (sizeof(vector)-1) bytes below the address of the
// original unaligned load.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMMO =
MF.getMachineMemOperand(LD->getMemOperand(),
-(long)MemVT.getStoreSize()+1,
2*MemVT.getStoreSize()-1);
// Create the new base load.
SDValue LDXIntID =
DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue BaseLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
BaseLoadOps, LDTy, BaseMMO);
// Note that the value of IncOffset (which is provided to the next
// load's pointer info offset value, and thus used to calculate the
// alignment), and the value of IncValue (which is actually used to
// increment the pointer value) are different! This is because we
// require the next load to appear to be aligned, even though it
// is actually offset from the base pointer by a lesser amount.
int IncOffset = VT.getSizeInBits() / 8;
int IncValue = IncOffset;
// Walk (both up and down) the chain looking for another load at the real
// (aligned) offset (the alignment of the other load does not matter in
// this case). If found, then do not use the offset reduction trick, as
// that will prevent the loads from being later combined (as they would
// otherwise be duplicates).
if (!findConsecutiveLoad(LD, DAG))
--IncValue;
SDValue Increment =
DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
MachineMemOperand *ExtraMMO =
MF.getMachineMemOperand(LD->getMemOperand(),
1, 2*MemVT.getStoreSize()-1);
SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue ExtraLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
ExtraLoadOps, LDTy, ExtraMMO);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
BaseLoad.getValue(1), ExtraLoad.getValue(1));
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
// and ExtraLoad here.
SDValue Perm;
if (isLittleEndian)
Perm = BuildIntrinsicOp(IntrPerm,
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
else
Perm = BuildIntrinsicOp(IntrPerm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != PermTy)
Perm = Subtarget.hasAltivec() ?
DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
DAG.getTargetConstant(1, dl, MVT::i64));
// second argument is 1 because this rounding
// is always exact.
// The output of the permutation is our loaded result, the TokenFactor is
// our new chain.
DCI.CombineTo(N, Perm, TF);
return SDValue(N, 0);
}
}
break;
case ISD::INTRINSIC_WO_CHAIN: {
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
: Intrinsic::ppc_altivec_lvsl);
if ((IID == Intr ||
IID == Intrinsic::ppc_qpx_qvlpcld ||
IID == Intrinsic::ppc_qpx_qvlpcls) &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
if (DAG.MaskedValueIsZero(Add->getOperand(1),
APInt::getAllOnesValue(Bits /* alignment */)
.zext(Add.getScalarValueSizeInBits()))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
return SDValue(*UI, 0);
}
}
}
if (isa<ConstantSDNode>(Add->getOperand(1))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(UI->getOperand(1)) &&
(cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
(1ULL << Bits) == 0) {
SDNode *OtherAdd = *UI;
for (SDNode::use_iterator VI = OtherAdd->use_begin(),
VE = OtherAdd->use_end(); VI != VE; ++VI) {
if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
return SDValue(*VI, 0);
}
}
}
}
}
}
// Combine vmaxsw/h/b(a, negation of a) into abs(a).
// This exposes the vabsduw/h/b opportunity for downstream combines.
if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
(IID == Intrinsic::ppc_altivec_vmaxsw ||
IID == Intrinsic::ppc_altivec_vmaxsh ||
IID == Intrinsic::ppc_altivec_vmaxsb)) {
SDValue V1 = N->getOperand(1);
SDValue V2 = N->getOperand(2);
if ((V1.getSimpleValueType() == MVT::v4i32 ||
V1.getSimpleValueType() == MVT::v8i16 ||
V1.getSimpleValueType() == MVT::v16i8) &&
V1.getSimpleValueType() == V2.getSimpleValueType()) {
// (0-a, a)
if (V1.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
V1.getOperand(1) == V2) {
return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
}
// (a, 0-a)
if (V2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
V2.getOperand(1) == V1) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
// (x-y, y-x)
if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
V1.getOperand(0) == V2.getOperand(1) &&
V1.getOperand(1) == V2.getOperand(0)) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
}
}
}
break;
case ISD::INTRINSIC_W_CHAIN:
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x:
return expandVSXLoadForLE(N, DCI);
}
}
break;
case ISD::INTRINSIC_VOID:
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
case Intrinsic::ppc_vsx_stxvw4x:
case Intrinsic::ppc_vsx_stxvd2x:
return expandVSXStoreForLE(N, DCI);
}
}
break;
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
N->getValueType(0) == MVT::i64))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(N->getValueType(0)) // VT
};
SDValue BSLoad =
DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
DAG.getVTList(N->getValueType(0) == MVT::i64 ?
MVT::i64 : MVT::i32, MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
// If this is an i16 load, insert the truncate.
SDValue ResVal = BSLoad;
if (N->getValueType(0) == MVT::i16)
ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
// First, combine the bswap away. This makes the value produced by the
// load dead.
DCI.CombineTo(N, ResVal);
// Next, combine the load away; we give it a bogus result value but a real
// chain result. The result value is dead because the bswap is dead.
DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
break;
case PPCISD::VCMP:
// If a VCMPo node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMPo computes both a CR6 and
// a normal output).
//
if (!N->getOperand(0).hasOneUse() &&
!N->getOperand(1).hasOneUse() &&
!N->getOperand(2).hasOneUse()) {
// Scan all of the users of the LHS, looking for VCMPo's that match.
SDNode *VCMPoNode = nullptr;
SDNode *LHSN = N->getOperand(0).getNode();
for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
UI != E; ++UI)
if (UI->getOpcode() == PPCISD::VCMPo &&
UI->getOperand(1) == N->getOperand(1) &&
UI->getOperand(2) == N->getOperand(2) &&
UI->getOperand(0) == N->getOperand(0)) {
VCMPoNode = *UI;
break;
}
// If there is no VCMPo node, or if the flag value has a single use, don't
// transform this.
if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
break;
// Look at the (necessarily single) use of the flag value. If it has a
// chain, this transformation is more complex. Note that multiple things
// could use the value result, which we should ignore.
SDNode *FlagUser = nullptr;
for (SDNode::use_iterator UI = VCMPoNode->use_begin();
FlagUser == nullptr; ++UI) {
assert(UI != VCMPoNode->use_end() && "Didn't find user!");
SDNode *User = *UI;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
FlagUser = User;
break;
}
}
}
// If the user is a MFOCRF instruction, we know this is safe.
// Otherwise we give up for right now.
if (FlagUser->getOpcode() == PPCISD::MFOCRF)
return SDValue(VCMPoNode, 0);
}
break;
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
assert(Cond.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
N->getOperand(0), Target);
}
}
break;
case ISD::BR_CC: {
// If this is a branch on an altivec predicate comparison, lower this so
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
// lowering is done pre-legalize, because the legalizer lowers the predicate
// compare down to code that is difficult to reassemble.
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
// Sometimes the promoted value of the intrinsic is ANDed with some non-zero
// value. If so, look through the AND to get to the intrinsic.
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
bool isBDNZ = (CC == ISD::SETEQ && Val) ||
(CC == ISD::SETNE && !Val);
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
assert(LHS.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
}
int CompareOpc;
bool isDot;
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
assert(isDot && "Can't compare against a vector result!");
// If this is a comparison against something other than 0/1, then we know
// that the condition is never/always true.
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
if (Val != 0 && Val != 1) {
if (CC == ISD::SETEQ) // Cond never true, remove branch.
return N->getOperand(0);
// Always !=, turn it into an unconditional branch.
return DAG.getNode(ISD::BR, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
}
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
// Create the PPCISD altivec 'dot' comparison node.
SDValue Ops[] = {
LHS.getOperand(2), // LHS of compare
LHS.getOperand(3), // RHS of compare
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Unpack the result based on how the target uses it.
PPC::Predicate CompOpc;
switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Branch on the value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
break;
case 1: // Branch on the inverted value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
break;
case 2: // Branch on the value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
break;
case 3: // Branch on the inverted value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
break;
}
return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
DAG.getConstant(CompOpc, dl, MVT::i32),
DAG.getRegister(PPC::CR6, MVT::i32),
N->getOperand(4), CompNode.getValue(1));
}
break;
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
case ISD::ABS:
return combineABS(N, DCI);
case ISD::VSELECT:
return combineVSelect(N, DCI);
}
return SDValue();
}
SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
// fold (sdiv X, pow2)
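// into (sra_addze X, log2(abs(pow2))), negating the result afterwards when
// the divisor is a negative power of two.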
EVT VT = N->getValueType(0);
if (VT == MVT::i64 && !Subtarget.isPPC64())
return SDValue();
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
bool IsNegPow2 = (-Divisor).isPowerOf2();
unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);
SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
Created.push_back(Op.getNode());
if (IsNegPow2) {
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
Created.push_back(Op.getNode());
}
return Op;
}
//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case PPCISD::LBRX: {
// lhbrx is known to have the top bits cleared out.
if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
Known.Zero = 0xFFFF0000;
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
default: break;
case Intrinsic::ppc_altivec_vcmpbfp_p:
case Intrinsic::ppc_altivec_vcmpeqfp_p:
case Intrinsic::ppc_altivec_vcmpequb_p:
case Intrinsic::ppc_altivec_vcmpequh_p:
case Intrinsic::ppc_altivec_vcmpequw_p:
case Intrinsic::ppc_altivec_vcmpequd_p:
case Intrinsic::ppc_altivec_vcmpgefp_p:
case Intrinsic::ppc_altivec_vcmpgtfp_p:
case Intrinsic::ppc_altivec_vcmpgtsb_p:
case Intrinsic::ppc_altivec_vcmpgtsh_p:
case Intrinsic::ppc_altivec_vcmpgtsw_p:
case Intrinsic::ppc_altivec_vcmpgtsd_p:
case Intrinsic::ppc_altivec_vcmpgtub_p:
case Intrinsic::ppc_altivec_vcmpgtuh_p:
case Intrinsic::ppc_altivec_vcmpgtuw_p:
case Intrinsic::ppc_altivec_vcmpgtud_p:
Known.Zero = ~1U; // All bits but the low one are known to be zero.
break;
}
}
}
}
Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE: {
if (!ML)
break;
if (!DisableInnermostLoopAlign32) {
// If the nested loop is an innermost loop, prefer a 32-byte alignment,
// so that we can decrease cache misses and branch-prediction misses.
// Actual alignment of the loop will depend on the hotness check and other
// logic in alignBlocks.
if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
return Align(32);
}
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
// For small loops (between 5 and 8 instructions), align to a 32-byte
// boundary so that the entire loop fits in one instruction-cache line.
uint64_t LoopSize = 0;
for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
LoopSize += TII->getInstSizeInBytes(*J);
if (LoopSize > 32)
break;
}
if (LoopSize > 16 && LoopSize <= 32)
return Align(32);
break;
}
}
return TargetLowering::getPrefLoopAlignment(ML);
}
/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
case 'b':
case 'r':
case 'f':
case 'd':
case 'v':
case 'y':
return C_RegisterClass;
case 'Z':
// FIXME: While Z does indicate a memory constraint, it specifically
// indicates an r+r address (used in conjunction with the 'y' modifier
// in the replacement string). Currently, we're forcing the base
// register to be r0 in the asm printer (which is interpreted as zero)
// and forming the complete address in the second register. This is
// suboptimal.
return C_Memory;
}
} else if (Constraint == "wc") { // individual CR bits.
return C_RegisterClass;
} else if (Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "ws" ||
Constraint == "wi" || Constraint == "ww") {
return C_RegisterClass; // VSX registers.
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
return CW_Register; // an individual CR bit.
else if ((StringRef(constraint) == "wa" ||
StringRef(constraint) == "wd" ||
StringRef(constraint) == "wf") &&
type->isVectorTy())
return CW_Register;
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
return CW_Register; // holds 64-bit integer data.
else if (StringRef(constraint) == "ws" && type->isDoubleTy())
return CW_Register;
else if (StringRef(constraint) == "ww" && type->isFloatTy())
return CW_Register;
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
break;
case 'b':
if (type->isIntegerTy())
weight = CW_Register;
break;
case 'f':
if (type->isFloatTy())
weight = CW_Register;
break;
case 'd':
if (type->isDoubleTy())
weight = CW_Register;
break;
case 'v':
if (type->isVectorTy())
weight = CW_Register;
break;
case 'y':
weight = CW_Register;
break;
case 'Z':
weight = CW_Memory;
break;
}
return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
case 'b': // R1-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
case 'r': // R0-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
// 'd' and 'f' constraints are both defined to be "the floating point
// registers", where one is for 32-bit and the other for 64-bit. We don't
// really care overly much here so just give them all the same reg classes.
case 'd':
case 'f':
if (Subtarget.hasSPE()) {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::GPRCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::SPERCRegClass);
} else {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::F4RCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::F8RCRegClass);
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
}
break;
case 'v':
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
break;
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
} else if (Constraint == "wc" && Subtarget.useCRBits()) {
// An individual CR bit.
return std::make_pair(0U, &PPC::CRBITRCRegClass);
} else if ((Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
} else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
else
return std::make_pair(0U, &PPC::VSFRCRegClass);
}
// If we name a VSX register, we can't defer to the base class because it
// will not recognize the correct register (their names will be VSL{0-31}
// and V{0-31} so they won't match). So we match them here.
if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
int VSNum = atoi(Constraint.data() + 3);
assert(VSNum >= 0 && VSNum <= 63 &&
"Attempted to access a vsr out of range");
if (VSNum < 32)
return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
}
std::pair<unsigned, const TargetRegisterClass *> R =
TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
// (which we call X[0-9]+). If a 64-bit value has been requested, and a
// 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
// register.
// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
PPC::GPRCRegClass.contains(R.first))
return std::make_pair(TRI->getMatchingSuperReg(R.first,
PPC::sub_32, &PPC::G8RCRegClass),
&PPC::G8RCRegClass);
// GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
R.first = PPC::CR0;
R.second = &PPC::CRRCRegClass;
}
return R;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints.
if (Constraint.length() > 1) return;
char Letter = Constraint[0];
switch (Letter) {
default: break;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P': {
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
if (!CST) return; // Must be an immediate to match.
SDLoc dl(Op);
int64_t Value = CST->getSExtValue();
EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
// numbers are printed as such.
switch (Letter) {
default: llvm_unreachable("Unknown constraint letter!");
case 'I': // "I" is a signed 16-bit constant.
if (isInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
if (isShiftedUInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
if (isShiftedInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
if (isUInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'M': // "M" is a constant that is greater than 31.
if (Value > 31)
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'N': // "N" is a positive constant that is an exact power of two.
if (Value > 0 && isPowerOf2_64(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'O': // "O" is the constant zero.
if (Value == 0)
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
if (isInt<16>(-Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
}
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
// Handle standard constraint letters.
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS, Instruction *I) const {
// PPC does not allow r+i addressing modes for vectors!
if (Ty->isVectorTy() && AM.BaseOffs != 0)
return false;
// PPC allows a sign-extended 16-bit immediate field.
if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
return false;
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// PPC only supports r+r addressing.
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
break;
case 1:
if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
return false;
// Otherwise we have r+r or r+i.
break;
case 2:
if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
return false;
// Allow 2*r as r+r.
break;
default:
// No other scales are supported.
return false;
}
return true;
}
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
// Make sure the function does not optimize away the store of the RA to
// the stack.
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setLRStoreRequired();
bool isPPC64 = Subtarget.isPPC64();
auto PtrVT = getPointerTy(MF.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
isPPC64 ? MVT::i64 : MVT::i32);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address off the stack.
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Naked functions never have a frame pointer, and so we use r1. For all
// other functions, this decision must be delayed until PEI.
unsigned FrameReg;
if (MF.getFunction().hasFnAttribute(Attribute::Naked))
FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
else
FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
PtrVT);
while (Depth--)
FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
FrameAddr, MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
bool isPPC64 = Subtarget.isPPC64();
bool is64Bit = isPPC64 && VT == LLT::scalar(64);
if (!is64Bit && VT != LLT::scalar(32))
report_fatal_error("Invalid register global variable type");
Register Reg = StringSwitch<Register>(RegName)
.Case("r1", is64Bit ? PPC::X1 : PPC::R1)
.Case("r2", isPPC64 ? Register() : PPC::R2)
.Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
.Default(Register());
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
// The 32-bit SVR4 ABI accesses everything as got-indirect.
if (Subtarget.is32BitELFABI())
return true;
// AIX accesses everything indirectly through the TOC, which is similar to
// the GOT.
if (Subtarget.isAIXABI())
return true;
CodeModel::Model CModel = getTargetMachine().getCodeModel();
// If it is small or large code model, module locals are accessed
// indirectly by loading their address from .toc/.got.
if (CModel == CodeModel::Small || CModel == CodeModel::Large)
return true;
// JumpTable and BlockAddress are accessed as got-indirect.
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
return true;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
return Subtarget.isGVIndirectSymbol(G->getGlobal());
return false;
}
bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The PowerPC target isn't yet aware of offsets.
return false;
}
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
case Intrinsic::ppc_altivec_lvewx:
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfd:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfs:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcd:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcs:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(0);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvlfda:
case Intrinsic::ppc_qpx_qvlfsa:
case Intrinsic::ppc_qpx_qvlfcda:
case Intrinsic::ppc_qpx_qvlfcsa:
case Intrinsic::ppc_qpx_qvlfiwaa:
case Intrinsic::ppc_qpx_qvlfiwza: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
case Intrinsic::ppc_altivec_stvewx:
case Intrinsic::ppc_vsx_stxvd2x:
case Intrinsic::ppc_vsx_stxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfd:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfs:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcd:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcs:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::ppc_qpx_qvstfda:
case Intrinsic::ppc_qpx_qvstfsa:
case Intrinsic::ppc_qpx_qvstfcda:
case Intrinsic::ppc_qpx_qvstfcsa:
case Intrinsic::ppc_qpx_qvstfiwa: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
default:
break;
}
return false;
}
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Op.size() >= 32 &&
(Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
if (Subtarget.hasAltivec() && Op.size() >= 16 &&
(Op.isAligned(Align(16)) ||
((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
return MVT::v4i32;
}
if (Subtarget.isPPC64()) {
return MVT::i64;
}
return MVT::i32;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
return !(BitSize == 0 || BitSize > 64);
}
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 == 64 && NumBits2 == 32;
}
bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 == 64 && NumBits2 == 32;
}
bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Generally speaking, zexts are not free, but they are free when they can be
// folded with other operations.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
(Subtarget.isPPC64() && MemVT == MVT::i32)) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
}
// FIXME: Add other cases...
// - 32-bit shifts with a zext to i64
// - zext after ctlz, bswap, etc.
// - zext after and by a constant mask
return TargetLowering::isZExtFree(Val, VT2);
}
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
// Extending to float128 is not free.
if (DestVT == MVT::f128)
return false;
return true;
}
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
}
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
}
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
MachineMemOperand::Flags,
bool *Fast) const {
if (DisablePPCUnaligned)
return false;
// PowerPC supports unaligned memory access for simple non-vector types.
// Although accessing unaligned addresses is not as efficient as accessing
// aligned addresses, it is generally more efficient than manual expansion,
// and generally only traps for software emulation when crossing page
// boundaries.
if (!VT.isSimple())
return false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!Subtarget.allowsUnalignedFPAccess())
return false;
if (VT.getSimpleVT().isVector()) {
if (Subtarget.hasVSX()) {
if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
VT != MVT::v4f32 && VT != MVT::v4i32)
return false;
} else {
return false;
}
}
if (VT == MVT::ppcf128)
return false;
if (Fast)
*Fast = true;
return true;
}
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
return isFMAFasterThanFMulAndFAdd(
MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
}
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
return true;
case Type::FP128TyID:
return Subtarget.hasP9Vector();
default:
return false;
}
}
// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
// FIXME: add more patterns which are profitable to hoist.
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
assert(User && "A single use instruction with no uses.");
if (User->getOpcode() != Instruction::FSub &&
User->getOpcode() != Instruction::FAdd)
return true;
const TargetOptions &Options = getTargetMachine().Options;
const Function *F = I->getFunction();
const DataLayout &DL = F->getParent()->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
return !(
isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
}
const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints. The same reasoning applies
// to CTR, which is used by any indirect call.
static const MCPhysReg ScratchRegs[] = {
PPC::X12, PPC::LR8, PPC::CTR8, 0
};
return ScratchRegs;
}
Register PPCTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}
Register PPCTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}
bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
EVT VT , unsigned DefinedValues) const {
if (VT == MVT::v2i64)
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
if (Subtarget.hasVSX() || Subtarget.hasQPX())
return true;
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
if (DisableILPPref || Subtarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
return Sched::ILP;
}
// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const {
return PPC::createFastISel(FuncInfo, LibInfo);
}
// 'Inverted' means the FMA opcode after negating one multiplicand.
// For example, (fma -a b c) = (fnmsub a b c)
static unsigned invertFMAOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Invalid FMA opcode for PowerPC!");
case ISD::FMA:
return PPCISD::FNMSUB;
case PPCISD::FNMSUB:
return ISD::FMA;
}
}
SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOps, bool OptForSize,
NegatibleCost &Cost,
unsigned Depth) const {
if (Depth > SelectionDAG::MaxRecursionDepth)
return SDValue();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
SDNodeFlags Flags = Op.getNode()->getFlags();
switch (Opc) {
case PPCISD::FNMSUB:
// TODO: QPX subtarget is deprecated. No transformation here.
if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
break;
const TargetOptions &Options = getTargetMachine().Options;
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
SDLoc Loc(Op);
NegatibleCost N2Cost = NegatibleCost::Expensive;
SDValue NegN2 =
getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
if (!NegN2)
return SDValue();
// (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
// (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
// These transformations may change sign of zeroes. For example,
// -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
// Try and choose the cheaper one to negate.
NegatibleCost N0Cost = NegatibleCost::Expensive;
SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
N0Cost, Depth + 1);
NegatibleCost N1Cost = NegatibleCost::Expensive;
SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
N1Cost, Depth + 1);
if (NegN0 && N0Cost <= N1Cost) {
Cost = std::min(N0Cost, N2Cost);
return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
} else if (NegN1) {
Cost = std::min(N1Cost, N2Cost);
return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
}
}
// (fneg (fnmsub a b c)) => (fma a b (fneg c))
if (isOperationLegal(ISD::FMA, VT)) {
Cost = N2Cost;
return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
}
break;
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
Cost, Depth);
}
// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool PPCTargetLowering::useLoadStackGuardNode() const {
if (!Subtarget.isTargetLinux())
return TargetLowering::useLoadStackGuardNode();
return true;
}
// Override to disable global variable loading on Linux.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget.isTargetLinux())
return TargetLowering::insertSSPDeclarations(M);
}
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
switch(VT.getSimpleVT().SimpleTy) {
default:
// For FP types that are currently not supported by PPC backend, return
// false. Examples: f16, f80.
return false;
case MVT::f32:
case MVT::f64:
if (Subtarget.hasPrefixInstrs()) {
// With prefixed instructions, we can materialize anything that can be
// represented with a 32-bit immediate, not just positive zero.
APFloat APFloatOfImm = Imm;
return convertToNonDenormSingle(APFloatOfImm);
}
LLVM_FALLTHROUGH;
case MVT::ppcf128:
return Imm.isPosZero();
}
}
// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
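// For example (illustrative): with x : v2i64, the mask is 63 == numbits(x) - 1,
// so (srl x, (and y, 63)) is folded to (PPCISD::SRL x, y).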
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
unsigned Opcode = N->getOpcode();
unsigned TargetOpcode;
switch (Opcode) {
default:
llvm_unreachable("Unexpected shift operation");
case ISD::SHL:
TargetOpcode = PPCISD::SHL;
break;
case ISD::SRL:
TargetOpcode = PPCISD::SRL;
break;
case ISD::SRA:
TargetOpcode = PPCISD::SRA;
break;
}
if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
N1->getOpcode() == ISD::AND)
if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
if (Mask->getZExtValue() == OpSizeInBits - 1)
return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
return SDValue();
}
SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!Subtarget.isISA3_0() ||
N0.getOpcode() != ISD::SIGN_EXTEND ||
N0.getOperand(0).getValueType() != MVT::i32 ||
CN1 == nullptr || N->getValueType(0) != MVT::i64)
return SDValue();
// We can't save an operation here if the value is already extended, and
// the existing shift is easier to combine.
SDValue ExtsSrc = N0.getOperand(0);
if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
return SDValue();
SDLoc DL(N0);
SDValue ShiftBy = SDValue(CN1, 0);
// We want the shift amount to be i32 on the extswsli, but the shift amount
// could be an i64.
if (ShiftBy.getValueType() == MVT::i64)
ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);
return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
ShiftBy);
}
SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
}
SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
}
// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the equation (addi Z, -C) can be simplified to Z
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
if (!Subtarget.isPPC64())
return SDValue();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
auto isZextOfCompareWithConstant = [](SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
Op.getValueType() != MVT::i64)
return false;
SDValue Cmp = Op.getOperand(0);
if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
Cmp.getOperand(0).getValueType() != MVT::i64)
return false;
if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
int64_t NegConstant = 0 - Constant->getSExtValue();
// Due to the limitations of the addi instruction,
// -C is required to be in [-32768, 32767].
return isInt<16>(NegConstant);
}
return false;
};
bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
// If there is a pattern, canonicalize a zext operand to the RHS.
if (LHSHasPattern && !RHSHasPattern)
std::swap(LHS, RHS);
else if (!LHSHasPattern && !RHSHasPattern)
return SDValue();
SDLoc DL(N);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
SDValue Cmp = RHS.getOperand(0);
SDValue Z = Cmp.getOperand(0);
auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
assert(Constant && "Constant Should not be a null pointer.");
int64_t NegConstant = 0 - Constant->getSExtValue();
switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
default: break;
case ISD::SETNE: {
// when C == 0
// --> addze X, (addic Z, -1).carry
// /
// add X, (zext(setne Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (addic (addi Z, -C), -1).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Addc.getNode(), 1));
}
case ISD::SETEQ: {
// when C == 0
// --> addze X, (subfic Z, 0).carry
// /
// add X, (zext(sete Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (subfic (addi Z, -C), 0).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
DAG.getConstant(0, DL, MVT::i64), AddOrZ);
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Subc.getNode(), 1));
}
}
return SDValue();
}
// Transform
// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
// In this case both C1 and C2 must be known constants.
// C1+C2 must fit into a 34 bit signed integer.
static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
if (!Subtarget.isUsingPCRelativeCalls())
return SDValue();
// Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
// If we find that node try to cast the Global Address and the Constant.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
std::swap(LHS, RHS);
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
return SDValue();
// Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
// Check that both casts succeeded.
if (!GSDN || !ConstNode)
return SDValue();
int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
SDLoc DL(GSDN);
// The signed int offset needs to fit in 34 bits.
if (!isInt<34>(NewOffset))
return SDValue();
// The new global address is a copy of the old global address except
// that it has the updated Offset.
SDValue GA =
DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
NewOffset, GSDN->getTargetFlags());
SDValue MatPCRel =
DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
return MatPCRel;
}
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
return Value;
if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
return Value;
return SDValue();
}
// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128 bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
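// For example (illustrative): (truncate (bitcast f128 %x to i128) to i64)
// becomes (extract_vector_elt (bitcast %x to v2i64), Elt), where Elt depends
// on endianness; an intervening (srl ..., 64) simply flips which element is
// extracted.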
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
DAGCombinerInfo &DCI) const {
// If we are using CRBits then try that first.
if (Subtarget.useCRBits()) {
// Check if CRBits did anything and return that if it did.
if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
return CRTruncValue;
}
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
// fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
EVT VT = N->getValueType(0);
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
SDValue Sub = Op0.getOperand(0);
if (Sub.getOpcode() == ISD::SUB) {
SDValue SubOp0 = Sub.getOperand(0);
SDValue SubOp1 = Sub.getOperand(1);
if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) &&
(SubOp1.getOpcode() == ISD::ZERO_EXTEND)) {
return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0),
SubOp1.getOperand(0),
DCI.DAG.getTargetConstant(0, dl, MVT::i32));
}
}
}
// Looking for a truncate of i128 to i64.
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
return SDValue();
int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
// SRL feeding TRUNCATE.
if (Op0.getOpcode() == ISD::SRL) {
ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
// The right shift has to be by 64 bits.
if (!ConstNode || ConstNode->getZExtValue() != 64)
return SDValue();
// Switch the element number to extract.
EltToExtract = EltToExtract ? 0 : 1;
// Update Op0 past the SRL.
Op0 = Op0.getOperand(0);
}
// BITCAST feeding a TRUNCATE possibly via SRL.
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getValueType() == MVT::i128 &&
Op0.getOperand(0).getValueType() == MVT::f128) {
SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
return DCI.DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
}
return SDValue();
}
SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
if (!ConstOpOrElement)
return SDValue();
// An imul is usually smaller than the alternative sequence for legal type.
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
isOperationLegal(ISD::MUL, N->getValueType(0)))
return SDValue();
auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
switch (this->Subtarget.getCPUDirective()) {
default:
// TODO: enhance the condition for subtarget before pwr8
return false;
case PPC::DIR_PWR8:
// type mul add shl
// scalar 4 1 1
// vector 7 2 2
return true;
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
// type mul add shl
// scalar 5 2 2
// vector 7 2 2
// The cycle counts of the relevant operations are shown in the table above.
// Because mul costs 5 (scalar) / 7 (vector) while add/sub/shl each cost 2 for
// both scalar and vector types, the 2-instruction patterns (add/sub + shl,
// cost 4) are always profitable; but the 3-instruction pattern
// (mul x, -(2^N + 1)) => -(add (shl x, N), x) (sub + add + shl, cost 6)
// is only profitable for vector types.
return IsAddOne && IsNeg ? VT.isVector() : true;
}
};
EVT VT = N->getValueType(0);
SDLoc DL(N);
const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
bool IsNeg = MulAmt.isNegative();
APInt MulAmtAbs = MulAmt.abs();
if ((MulAmtAbs - 1).isPowerOf2()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, -(2^N + 1)) => -(add (shl x, N), x)
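// For example (illustrative): (mul x, 9) => (add (shl x, 3), x) and
// (mul x, -9) => (sub 0, (add (shl x, 3), x)).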
if (!IsProfitable(IsNeg, true, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
if (!IsNeg)
return Res;
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
} else if ((MulAmtAbs + 1).isPowerOf2()) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
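// For example (illustrative): (mul x, 7) => (sub (shl x, 3), x) and
// (mul x, -7) => (sub x, (shl x, 3)).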
if (!IsProfitable(IsNeg, false, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
if (!IsNeg)
return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
else
return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
} else {
return SDValue();
}
}
// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
// in combiner since we need to check SD flags and other subtarget features.
SDValue PPCTargetLowering::combineFMALike(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
SDNodeFlags Flags = N->getFlags();
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
const TargetOptions &Options = getTargetMachine().Options;
unsigned Opc = N->getOpcode();
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOps = !DCI.isBeforeLegalizeOps();
SDLoc Loc(N);
// TODO: QPX subtarget is deprecated. No transformation here.
if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
return SDValue();
// Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
// since (fnmsub a b c)=-0 while c-ab=+0.
if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
return SDValue();
// (fma (fneg a) b c) => (fnmsub a b c)
// (fnmsub (fneg a) b c) => (fma a b c)
if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
// (fma a (fneg b) c) => (fnmsub a b c)
// (fnmsub a (fneg b) c) => (fma a b c)
if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
return SDValue();
}
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
if (!Subtarget.is64BitELFABI())
return false;
// If not a tail call then no need to proceed.
if (!CI->isTailCall())
return false;
// If sibling calls have been disabled and tail-calls aren't guaranteed
// there is no reason to duplicate.
auto &TM = getTargetMachine();
if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
return false;
// Can't tail call a function called indirectly, or if it has variadic args.
const Function *Callee = CI->getCalledFunction();
if (!Callee || Callee->isVarArg())
return false;
// Make sure the callee and caller calling conventions are eligible for tco.
const Function *Caller = CI->getParent()->getParent();
if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
CI->getCallingConv()))
return false;
// If the function is local then we have a good chance at tail-calling it.
return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}
bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
if (!Subtarget.hasVSX())
return false;
if (Subtarget.hasP9Vector() && VT == MVT::f128)
return true;
return VT == MVT::f32 || VT == MVT::f64 ||
VT == MVT::v4f32 || VT == MVT::v2f64;
}
bool PPCTargetLowering::
isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
const Value *Mask = AndI.getOperand(1);
// If the mask is suitable for andi. or andis. we should sink the and.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
// Can't handle constants wider than 64-bits.
if (CI->getBitWidth() > 64)
return false;
int64_t ConstVal = CI->getZExtValue();
return isUInt<16>(ConstVal) ||
(isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
}
// For non-constant masks, we can always use the record-form and.
return true;
}
// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub a, b)) to (vabsd a b 1) if a & b are of type v4i32
SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
EVT VT = N->getValueType(0);
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (N->getOperand(0).getOpcode() == ISD::SUB) {
// Even for signed integers this is safe: the zero-extended inputs are known
// to be non-negative when interpreted as signed integers.
unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
if ((SubOpcd0 == ISD::ZERO_EXTEND ||
SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
(SubOpcd1 == ISD::ZERO_EXTEND ||
SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
N->getOperand(0)->getOperand(0),
N->getOperand(0)->getOperand(1),
DAG.getTargetConstant(0, dl, MVT::i32));
}
// For type v4i32, it can be optimized with xvnegsp + vabsduw
if (N->getOperand(0).getValueType() == MVT::v4i32 &&
N->getOperand(0).hasOneUse()) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
N->getOperand(0)->getOperand(0),
N->getOperand(0)->getOperand(1),
DAG.getTargetConstant(1, dl, MVT::i32));
}
}
return SDValue();
}
// For type v4i32/v8i16/v16i8, transform
// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
SDValue PPCTargetLowering::combineVSelect(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
SDValue TrueOpnd = N->getOperand(1);
SDValue FalseOpnd = N->getOperand(2);
EVT VT = N->getOperand(1).getValueType();
if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
FalseOpnd.getOpcode() != ISD::SUB)
return SDValue();
// ABSD is only available for types v4i32/v8i16/v16i8
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
// Require that at least one operand has a single use, so the combine saves
// at least one dependent computation.
if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Can only handle unsigned comparison here
switch (CC) {
default:
return SDValue();
case ISD::SETUGT:
case ISD::SETUGE:
break;
case ISD::SETULT:
case ISD::SETULE:
std::swap(TrueOpnd, FalseOpnd);
break;
}
SDValue CmpOpnd1 = Cond.getOperand(0);
SDValue CmpOpnd2 = Cond.getOperand(1);
// SETCC CmpOpnd1 CmpOpnd2 cond
// TrueOpnd = CmpOpnd1 - CmpOpnd2
// FalseOpnd = CmpOpnd2 - CmpOpnd1
if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
TrueOpnd.getOperand(1) == CmpOpnd2 &&
FalseOpnd.getOperand(0) == CmpOpnd2 &&
FalseOpnd.getOperand(1) == CmpOpnd1) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
CmpOpnd1, CmpOpnd2,
DAG.getTargetConstant(0, dl, MVT::i32));
}
return SDValue();
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index a0ae05081adc..7570385e38e3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1,302 +1,626 @@
//===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the RISCV target.
//
//===----------------------------------------------------------------------===//
#include "RISCVISelDAGToDAG.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-isel"
void RISCVDAGToDAGISel::PostprocessISelDAG() {
doPeepholeLoadStoreADDI();
}
static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
MVT XLenVT) {
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
SDNode *Result = nullptr;
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
for (RISCVMatInt::Inst &Inst : Seq) {
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
if (Inst.Opc == RISCV::LUI)
Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
else
Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
// Only the first instruction has X0 as its source.
SrcReg = SDValue(Result, 0);
}
return Result;
}
// Returns true if the Node is an ISD::AND with a constant argument. If so,
// set Mask to that constant value.
static bool isConstantMask(SDNode *Node, uint64_t &Mask) {
if (Node->getOpcode() == ISD::AND &&
Node->getOperand(1).getOpcode() == ISD::Constant) {
Mask = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
return true;
}
return false;
}
void RISCVDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we have already selected.
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
Node->setNodeId(-1);
return;
}
// Instruction Selection not handled by the auto-generated tablegen selection
// should be handled here.
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
switch (Opcode) {
case ISD::ADD: {
// Optimize (add r, imm) to (addi (addi r, imm0) imm1) if applicable. The
// immediate must be in specific ranges and have a single use.
if (auto *ConstOp = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
if (!(ConstOp->hasOneUse()))
break;
// The imm must be in range [-4096,-2049] or [2048,4094].
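// For example (illustrative): imm = 3000 is split as 1500 + 1500, giving
// (addi (addi r, 1500), 1500); both halves fit in a signed 12-bit immediate.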
int64_t Imm = ConstOp->getSExtValue();
if (!(-4096 <= Imm && Imm <= -2049) && !(2048 <= Imm && Imm <= 4094))
break;
// Split the imm into imm0 + imm1.
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
const SDValue ImmOp0 = CurDAG->getTargetConstant(Imm - Imm / 2, DL, VT);
const SDValue ImmOp1 = CurDAG->getTargetConstant(Imm / 2, DL, VT);
auto *NodeAddi0 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
Node->getOperand(0), ImmOp0);
auto *NodeAddi1 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
SDValue(NodeAddi0, 0), ImmOp1);
ReplaceNode(Node, NodeAddi1);
return;
}
break;
}
case ISD::Constant: {
auto ConstNode = cast<ConstantSDNode>(Node);
if (VT == XLenVT && ConstNode->isNullValue()) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
RISCV::X0, XLenVT);
ReplaceNode(Node, New.getNode());
return;
}
int64_t Imm = ConstNode->getSExtValue();
if (XLenVT == MVT::i64) {
ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT));
return;
}
break;
}
case ISD::FrameIndex: {
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
return;
}
case ISD::SRL: {
if (!Subtarget->is64Bit())
break;
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
uint64_t Mask;
// Match (srl (and val, mask), imm) where the result would be a
// zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
// is equivalent to this (SimplifyDemandedBits may have removed lower bits
// from the mask that aren't necessary due to the right-shifting).
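// For example (illustrative): (srl (and x, 0xfffffff0), 4) is selected as
// (SRLIW x, 4), since 0xfffffff0 | 0xf == 0xffffffff.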
if (Op1.getOpcode() == ISD::Constant &&
isConstantMask(Op0.getNode(), Mask)) {
uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
SDValue ShAmtVal =
CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
ShAmtVal);
return;
}
}
break;
}
case RISCVISD::READ_CYCLE_WIDE:
assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32");
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32,
MVT::i32, MVT::Other,
Node->getOperand(0)));
return;
}
// Select the default instruction.
SelectCode(Node);
}
bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
switch (ConstraintID) {
case InlineAsm::Constraint_m:
// We just support simple memory operands that have a single address
// operand and need no special handling.
OutOps.push_back(Op);
return false;
case InlineAsm::Constraint_A:
OutOps.push_back(Op);
return false;
default:
break;
}
return true;
}
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
return true;
}
return false;
}
+// Check that it is a SLOI (Shift Left Ones Immediate). We first check that
+// it is the right node tree:
+//
+// (OR (SHL RS1, VC2), VC1)
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskTrailingOnes<uint64_t>(VC2)
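+//
+// For example (illustrative): on RV64, (or (shl RS1, 4), 0xF) matches, since
+// 0xF == maskTrailingOnes<uint64_t>(4), and is selected as SLOI RS1, 4.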
+
+bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ MVT XLenVT = Subtarget->getXLenVT();
+ if (N.getOpcode() == ISD::OR) {
+ SDValue Or = N;
+ if (Or.getOperand(0).getOpcode() == ISD::SHL) {
+ SDValue Shl = Or.getOperand(0);
+ if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
+ isa<ConstantSDNode>(Or.getOperand(1))) {
+ if (XLenVT == MVT::i64) {
+ uint64_t VC1 = Or.getConstantOperandVal(1);
+ uint64_t VC2 = Shl.getConstantOperandVal(1);
+ if (VC1 == maskTrailingOnes<uint64_t>(VC2)) {
+ RS1 = Shl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Shl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ if (XLenVT == MVT::i32) {
+ uint32_t VC1 = Or.getConstantOperandVal(1);
+ uint32_t VC2 = Shl.getConstantOperandVal(1);
+ if (VC1 == maskTrailingOnes<uint32_t>(VC2)) {
+ RS1 = Shl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Shl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a SROI (Shift Right Ones Immediate). We first check that
+// it is the right node tree:
+//
+// (OR (SRL RS1, VC2), VC1)
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskLeadingOnes<uint64_t>(VC2)
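+//
+// For example (illustrative): on RV64, (or (srl RS1, 4), 0xF000000000000000)
+// matches, since 0xF000000000000000 == maskLeadingOnes<uint64_t>(4), and is
+// selected as SROI RS1, 4.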
+
+bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ MVT XLenVT = Subtarget->getXLenVT();
+ if (N.getOpcode() == ISD::OR) {
+ SDValue Or = N;
+ if (Or.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue Srl = Or.getOperand(0);
+ if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+ isa<ConstantSDNode>(Or.getOperand(1))) {
+ if (XLenVT == MVT::i64) {
+ uint64_t VC1 = Or.getConstantOperandVal(1);
+ uint64_t VC2 = Srl.getConstantOperandVal(1);
+ if (VC1 == maskLeadingOnes<uint64_t>(VC2)) {
+ RS1 = Srl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Srl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ if (XLenVT == MVT::i32) {
+ uint32_t VC1 = Or.getConstantOperandVal(1);
+ uint32_t VC2 = Srl.getConstantOperandVal(1);
+ if (VC1 == maskLeadingOnes<uint32_t>(VC2)) {
+ RS1 = Srl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Srl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a RORI (Rotate Right Immediate). We first check that
+// it is the right node tree:
+//
+// (ROTL RS1, VC)
+//
+// The compiler canonicalizes the immediate right rotations produced by the
+// rotateright32/rotateright64 intrinsics into left rotations. Since a left
+// rotation by a constant can always be emulated as a right rotation by the
+// negated constant, there is no ROLI encoding; we therefore select an
+// immediate left rotation as RORI with the complementary constant:
+//
+// Shamt == XLen - VC
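+//
+// For example (illustrative): on RV64, (rotl RS1, 8) is selected as
+// RORI RS1, 56, since 56 == 64 - 8.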
+
+bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ MVT XLenVT = Subtarget->getXLenVT();
+ if (N.getOpcode() == ISD::ROTL) {
+ if (isa<ConstantSDNode>(N.getOperand(1))) {
+ if (XLenVT == MVT::i64) {
+ uint64_t VC = N.getConstantOperandVal(1);
+ Shamt = CurDAG->getTargetConstant((64 - VC), SDLoc(N),
+ N.getOperand(1).getValueType());
+ RS1 = N.getOperand(0);
+ return true;
+ }
+ if (XLenVT == MVT::i32) {
+ uint32_t VC = N.getConstantOperandVal(1);
+ Shamt = CurDAG->getTargetConstant((32 - VC), SDLoc(N),
+ N.getOperand(1).getValueType());
+ RS1 = N.getOperand(0);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
+// on RV64).
+// SLLIUW is the same as SLLI except for the fact that it clears the bits
+// XLEN-1:32 of the input RS1 before shifting.
+// We first check that it is the right node tree:
+//
+// (AND (SHL RS1, VC2), VC1)
+//
+// We check that VC2, the shamt, is less than 32; otherwise the pattern is
+// exactly the same as SLLI and we give priority to that.
+// Finally, we check that VC1, the mask used to clear the upper 32 bits
+// of RS1, is correct:
+//
+// VC1 == (0xFFFFFFFF << VC2)
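+//
+// For example (illustrative): (and (shl RS1, 4), 0xFFFFFFFF0) matches, since
+// 0xFFFFFFFF0 == (0xFFFFFFFF << 4), and is selected as SLLIUW with shamt 4.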
+
+bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ if (N.getOpcode() == ISD::AND && Subtarget->getXLenVT() == MVT::i64) {
+ SDValue And = N;
+ if (And.getOperand(0).getOpcode() == ISD::SHL) {
+ SDValue Shl = And.getOperand(0);
+ if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
+ isa<ConstantSDNode>(And.getOperand(1))) {
+ uint64_t VC1 = And.getConstantOperandVal(1);
+ uint64_t VC2 = Shl.getConstantOperandVal(1);
+ if (VC2 < 32 && VC1 == ((uint64_t)0xFFFFFFFF << VC2)) {
+ RS1 = Shl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Shl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1))
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskTrailingOnes<uint32_t>(VC2)
+
+bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ if (Subtarget->getXLenVT() == MVT::i64 &&
+ N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+ if (N.getOperand(0).getOpcode() == ISD::OR) {
+ SDValue Or = N.getOperand(0);
+ if (Or.getOperand(0).getOpcode() == ISD::SHL) {
+ SDValue Shl = Or.getOperand(0);
+ if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
+ isa<ConstantSDNode>(Or.getOperand(1))) {
+ uint32_t VC1 = Or.getConstantOperandVal(1);
+ uint32_t VC2 = Shl.getConstantOperandVal(1);
+ if (VC1 == maskTrailingOnes<uint32_t>(VC2)) {
+ RS1 = Shl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Shl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+// (OR (SHL RS1, VC2), VC1)
+//
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC1 == maskLeadingOnes<uint32_t>(VC2)
+
+bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ if (N.getOpcode() == ISD::OR && Subtarget->getXLenVT() == MVT::i64) {
+ SDValue Or = N;
+ if (Or.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue Srl = Or.getOperand(0);
+ if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+ isa<ConstantSDNode>(Or.getOperand(1))) {
+ uint32_t VC1 = Or.getConstantOperandVal(1);
+ uint32_t VC2 = Srl.getConstantOperandVal(1);
+ if (VC1 == maskLeadingOnes<uint32_t>(VC2)) {
+ RS1 = Srl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+ Srl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a RORIW (i32 Right Rotate Immediate on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2),
+// (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
+//
+// Then we check that the constant operands respect these constraints:
+//
+// VC2 == 32 - VC1
+// VC3 == maskLeadingOnes<uint32_t>(VC2)
+//
+// where VC1 is the Shamt we need, VC2 is the complement of Shamt with respect
+// to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones.
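+//
+// For example (illustrative): VC1 == 8, VC2 == 24 and VC3 == 0xFFFFFF00
+// satisfy these constraints and select a RORIW with Shamt == 8.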
+
+bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ Subtarget->getXLenVT() == MVT::i64 &&
+ cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+ if (N.getOperand(0).getOpcode() == ISD::OR) {
+ SDValue Or = N.getOperand(0);
+ if (Or.getOperand(0).getOpcode() == ISD::SHL &&
+ Or.getOperand(1).getOpcode() == ISD::SRL) {
+ SDValue Shl = Or.getOperand(0);
+ SDValue Srl = Or.getOperand(1);
+ if (Srl.getOperand(0).getOpcode() == ISD::AND) {
+ SDValue And = Srl.getOperand(0);
+ if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+ isa<ConstantSDNode>(Shl.getOperand(1)) &&
+ isa<ConstantSDNode>(And.getOperand(1))) {
+ uint32_t VC1 = Srl.getConstantOperandVal(1);
+ uint32_t VC2 = Shl.getConstantOperandVal(1);
+ uint32_t VC3 = And.getConstantOperandVal(1);
+ if (VC2 == (32 - VC1) &&
+ VC3 == maskLeadingOnes<uint32_t>(VC2)) {
+ RS1 = Shl.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
+ Srl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
+// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64).
+// We first check that it is the right node tree:
+//
+// (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2),
+// (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
+//
+// Then we check that the constant operands respect these constraints:
+//
+// VC2 == 32 - VC1
+// VC3 == maskLeadingOnes<uint32_t>(VC2)
+//
+// where VC1 is the Shamt we need, VC2 is the complement of Shamt with respect
+// to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones.
+
+bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2,
+ SDValue &Shamt) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ Subtarget->getXLenVT() == MVT::i64 &&
+ cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+ if (N.getOperand(0).getOpcode() == ISD::OR) {
+ SDValue Or = N.getOperand(0);
+ if (Or.getOperand(0).getOpcode() == ISD::SHL &&
+ Or.getOperand(1).getOpcode() == ISD::SRL) {
+ SDValue Shl = Or.getOperand(0);
+ SDValue Srl = Or.getOperand(1);
+ if (Srl.getOperand(0).getOpcode() == ISD::AND) {
+ SDValue And = Srl.getOperand(0);
+ if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
+ isa<ConstantSDNode>(Shl.getOperand(1)) &&
+ isa<ConstantSDNode>(And.getOperand(1))) {
+ uint32_t VC1 = Srl.getConstantOperandVal(1);
+ uint32_t VC2 = Shl.getConstantOperandVal(1);
+ uint32_t VC3 = And.getConstantOperandVal(1);
+ if (VC2 == (32 - VC1) &&
+ VC3 == maskLeadingOnes<uint32_t>(VC2)) {
+ RS1 = Shl.getOperand(0);
+ RS2 = And.getOperand(0);
+ Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
+ Srl.getOperand(1).getValueType());
+ return true;
+ }
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
// This is possible when off1+off2 fits a 12-bit immediate.
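// For example (illustrative): (LW (ADDI base, 16), 4) folds to (LW base, 20),
// since the combined offset 20 still fits in a signed 12-bit immediate.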
void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
int OffsetOpIdx;
int BaseOpIdx;
// Only attempt this optimisation for I-type loads and S-type stores.
switch (N->getMachineOpcode()) {
default:
continue;
case RISCV::LB:
case RISCV::LH:
case RISCV::LW:
case RISCV::LBU:
case RISCV::LHU:
case RISCV::LWU:
case RISCV::LD:
case RISCV::FLW:
case RISCV::FLD:
BaseOpIdx = 0;
OffsetOpIdx = 1;
break;
case RISCV::SB:
case RISCV::SH:
case RISCV::SW:
case RISCV::SD:
case RISCV::FSW:
case RISCV::FSD:
BaseOpIdx = 1;
OffsetOpIdx = 2;
break;
}
if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
continue;
SDValue Base = N->getOperand(BaseOpIdx);
// If the base is an ADDI, we can merge it in to the load/store.
if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
continue;
SDValue ImmOperand = Base.getOperand(1);
uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
int64_t Offset1 = Const->getSExtValue();
int64_t CombinedOffset = Offset1 + Offset2;
if (!isInt<12>(CombinedOffset))
continue;
ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
ImmOperand.getValueType());
} else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
// If the off1 in (addi base, off1) is a global variable's address (its
// low part, really), then we can rely on the alignment of that variable
// to provide a margin of safety before off1 can overflow the 12 bits.
// Check if off2 falls within that margin; if so off1+off2 can't overflow.
const DataLayout &DL = CurDAG->getDataLayout();
Align Alignment = GA->getGlobal()->getPointerAlignment(DL);
if (Offset2 != 0 && Alignment <= Offset2)
continue;
int64_t Offset1 = GA->getOffset();
int64_t CombinedOffset = Offset1 + Offset2;
ImmOperand = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
CombinedOffset, GA->getTargetFlags());
} else if (auto CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
// Ditto.
Align Alignment = CP->getAlign();
if (Offset2 != 0 && Alignment <= Offset2)
continue;
int64_t Offset1 = CP->getOffset();
int64_t CombinedOffset = Offset1 + Offset2;
ImmOperand = CurDAG->getTargetConstantPool(
CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
CombinedOffset, CP->getTargetFlags());
} else {
continue;
}
LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
LLVM_DEBUG(Base->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\nN: ");
LLVM_DEBUG(N->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\n");
// Modify the offset operand of the load/store.
if (BaseOpIdx == 0) // Load
CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
N->getOperand(2));
else // Store
CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
ImmOperand, N->getOperand(3));
// The add-immediate may now be dead, in which case remove it.
if (Base.getNode()->use_empty())
CurDAG->RemoveDeadNode(Base.getNode());
}
}
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
return new RISCVDAGToDAGISel(TM);
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index dcf733ec3675..0ca12510a230 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -1,56 +1,65 @@
//===---- RISCVISelDAGToDAG.h - A dag to dag inst selector for RISCV ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the RISCV target.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H
#define LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
// RISCV-specific code to select RISCV machine instructions for
// SelectionDAG operations.
namespace llvm {
class RISCVDAGToDAGISel : public SelectionDAGISel {
const RISCVSubtarget *Subtarget = nullptr;
public:
explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine)
: SelectionDAGISel(TargetMachine) {}
StringRef getPassName() const override {
return "RISCV DAG->DAG Pattern Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<RISCVSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
void PostprocessISelDAG() override;
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
bool SelectAddrFI(SDValue Addr, SDValue &Base);
+ bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt);
+
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
private:
void doPeepholeLoadStoreADDI();
};
}
#endif
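// The new Select* hooks above are complex-pattern helpers named after the
// draft bit-manipulation ("B") extension instructions they appear to target.
// As a rough sketch of the semantics being matched (an assumption based on
// the draft B specification, shown with plain C++ integers):
//
//   // SLOI: shift left, filling the vacated low bits with ones.
//   uint64_t sloi(uint64_t RS1, unsigned Shamt) { return ~(~RS1 << Shamt); }
//   // SROI: logical shift right, filling the vacated high bits with ones.
//   uint64_t sroi(uint64_t RS1, unsigned Shamt) { return ~(~RS1 >> Shamt); }
//   // RORI: rotate right by an immediate amount (Shamt in 1..63).
//   uint64_t rori(uint64_t RS1, unsigned Shamt) {
//     return (RS1 >> Shamt) | (RS1 << (64 - Shamt));
//   }
//
// The *W variants operate on the low 32 bits and sign-extend the 32-bit
// result, and SLLIUW zero-extends a 32-bit value before shifting it left.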
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 91fc69b5bc10..03d9eefd59d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1,3019 +1,3034 @@
//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "RISCVISelLowering.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
if (Subtarget.isRV32E())
report_fatal_error("Codegen not yet implemented for RV32E");
RISCVABI::ABI ABI = Subtarget.getTargetABI();
assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
if ((ABI == RISCVABI::ABI_ILP32F || ABI == RISCVABI::ABI_LP64F) &&
!Subtarget.hasStdExtF()) {
errs() << "Hard-float 'f' ABI can't be used for a target that "
"doesn't support the F instruction set extension (ignoring "
"target-abi)\n";
ABI = Subtarget.is64Bit() ? RISCVABI::ABI_LP64 : RISCVABI::ABI_ILP32;
} else if ((ABI == RISCVABI::ABI_ILP32D || ABI == RISCVABI::ABI_LP64D) &&
!Subtarget.hasStdExtD()) {
errs() << "Hard-float 'd' ABI can't be used for a target that "
"doesn't support the D instruction set extension (ignoring "
"target-abi)\n";
ABI = Subtarget.is64Bit() ? RISCVABI::ABI_LP64 : RISCVABI::ABI_ILP32;
}
switch (ABI) {
default:
report_fatal_error("Don't know how to lower this ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64:
case RISCVABI::ABI_LP64F:
case RISCVABI::ABI_LP64D:
break;
}
MVT XLenVT = Subtarget.getXLenVT();
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
setStackPointerRegisterToSaveRestore(RISCV::X2);
for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
setLoadExtAction(N, XLenVT, MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
setOperationAction(ISD::SELECT, XLenVT, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
}
if (!Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::MULHS, XLenVT, Expand);
setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SDIV, XLenVT, Expand);
setOperationAction(ISD::UDIV, XLenVT, Expand);
setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::UREM, XLenVT, Expand);
}
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, MVT::i32, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
}
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
- setOperationAction(ISD::ROTL, XLenVT, Expand);
- setOperationAction(ISD::ROTR, XLenVT, Expand);
- setOperationAction(ISD::BSWAP, XLenVT, Expand);
- setOperationAction(ISD::CTTZ, XLenVT, Expand);
- setOperationAction(ISD::CTLZ, XLenVT, Expand);
- setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) {
+ setOperationAction(ISD::ROTL, XLenVT, Expand);
+ setOperationAction(ISD::ROTR, XLenVT, Expand);
+ }
+
+ if (!Subtarget.hasStdExtZbp())
+ setOperationAction(ISD::BSWAP, XLenVT, Expand);
+
+ if (!Subtarget.hasStdExtZbb()) {
+ setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ setOperationAction(ISD::CTLZ, XLenVT, Expand);
+ setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ }
+
+ if (Subtarget.hasStdExtZbp())
+ setOperationAction(ISD::BITREVERSE, XLenVT, Legal);
+
+ if (Subtarget.hasStdExtZbt()) {
+ setOperationAction(ISD::FSHL, XLenVT, Legal);
+ setOperationAction(ISD::FSHR, XLenVT, Legal);
+ }
ISD::CondCode FPCCToExtend[] = {
ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE};
ISD::NodeType FPOpToExtend[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM, ISD::FP16_TO_FP,
ISD::FP_TO_FP16};
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
}
if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
}
if (Subtarget.is64Bit() &&
!(Subtarget.hasStdExtD() || Subtarget.hasStdExtF())) {
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
}
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
// TODO: On M-mode only targets, the cycle[h] CSR may not be present.
// Unfortunately this can't be determined just from the ISA naming string.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
Subtarget.is64Bit() ? Legal : Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
setMinCmpXchgSizeInBits(32);
} else {
setMaxAtomicSizeInBitsSupported(0);
}
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtC() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
setPrefFunctionAlignment(FunctionAlignment);
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
// Jumps are expensive, compared to logic
setJumpIsExpensive();
// We can use any register for comparisons
setHasMultipleConditionRegisters();
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
return VT.changeVectorElementTypeToInteger();
}
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
default:
return false;
case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
case Intrinsic::riscv_masked_atomicrmw_add_i32:
case Intrinsic::riscv_masked_atomicrmw_sub_i32:
case Intrinsic::riscv_masked_atomicrmw_nand_i32:
case Intrinsic::riscv_masked_atomicrmw_max_i32:
case Intrinsic::riscv_masked_atomicrmw_min_i32:
case Intrinsic::riscv_masked_atomicrmw_umax_i32:
case Intrinsic::riscv_masked_atomicrmw_umin_i32:
case Intrinsic::riscv_masked_cmpxchg_i32:
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(4);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
}
}
bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// Require a 12-bit signed offset.
if (!isInt<12>(AM.BaseOffs))
return false;
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
break;
case 1:
if (!AM.HasBaseReg) // allow "r+i".
break;
return false; // disallow "r+r" or "r+r+i".
default:
return false;
}
return true;
}
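// In other words, the only addressing mode the backend accepts is a single
// base register plus a signed 12-bit displacement. A compact sketch of the
// same decision with plain values (hypothetical helper, not used above):
//
//   static bool isLegalRVAddrMode(int64_t Offs, int Scale, bool HasBaseReg) {
//     if (Offs < -2048 || Offs > 2047)   // must fit the 12-bit offset field
//       return false;
//     if (Scale == 0)                    // "r+i" or a bare "i"
//       return true;
//     return Scale == 1 && !HasBaseReg;  // a lone register; "r+r" never works
//   }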
bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<12>(Imm);
}
bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<12>(Imm);
}
// On RV32, 64-bit integers are split into their high and low parts and held
// in two different registers, so the trunc is free since the low register can
// just be used.
bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
return false;
unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
unsigned DestBits = DstTy->getPrimitiveSizeInBits();
return (SrcBits == 64 && DestBits == 32);
}
bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
!SrcVT.isInteger() || !DstVT.isInteger())
return false;
unsigned SrcBits = SrcVT.getSizeInBits();
unsigned DestBits = DstVT.getSizeInBits();
return (SrcBits == 64 && DestBits == 32);
}
bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Zexts are free if they can be combined with a load.
if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
(Subtarget.is64Bit() && MemVT == MVT::i32)) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
}
return TargetLowering::isZExtFree(Val, VT2);
}
bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (VT == MVT::f32 && !Subtarget.hasStdExtF())
return false;
if (VT == MVT::f64 && !Subtarget.hasStdExtD())
return false;
if (Imm.isNegZero())
return false;
return Imm.isZero();
}
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
(VT == MVT::f64 && Subtarget.hasStdExtD());
}
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
switch (CC) {
default:
break;
case ISD::SETGT:
case ISD::SETLE:
case ISD::SETUGT:
case ISD::SETULE:
CC = ISD::getSetCCSwappedOperands(CC);
std::swap(LHS, RHS);
break;
}
}
// Return the RISC-V branch opcode that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
// ISA (see normaliseSetCC).
static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unsupported CondCode");
case ISD::SETEQ:
return RISCV::BEQ;
case ISD::SETNE:
return RISCV::BNE;
case ISD::SETLT:
return RISCV::BLT;
case ISD::SETGE:
return RISCV::BGE;
case ISD::SETULT:
return RISCV::BLTU;
case ISD::SETUGE:
return RISCV::BGEU;
}
}
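// Taken together, normaliseSetCC and getBranchOpcodeForIntCondCode map a
// comparison the ISA lacks onto one it has by swapping the operands first.
// A sketch of one such case:
//
//   (setcc %a, %b, setgt)                ; there is no BGT instruction
//     --normaliseSetCC-->                cc = setlt, lhs = %b, rhs = %a
//     --getBranchOpcodeForIntCondCode--> RISCV::BLT %b, %a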
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
report_fatal_error("unimplemented operand");
case ISD::GlobalAddress:
return lowerGlobalAddress(Op, DAG);
case ISD::BlockAddress:
return lowerBlockAddress(Op, DAG);
case ISD::ConstantPool:
return lowerConstantPool(Op, DAG);
case ISD::GlobalTLSAddress:
return lowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
case ISD::VASTART:
return lowerVASTART(Op, DAG);
case ISD::FRAMEADDR:
return lowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR:
return lowerRETURNADDR(Op, DAG);
case ISD::SHL_PARTS:
return lowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:
return lowerShiftRightParts(Op, DAG, true);
case ISD::SRL_PARTS:
return lowerShiftRightParts(Op, DAG, false);
case ISD::BITCAST: {
assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
"Unexpected custom legalisation");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
return SDValue();
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
return FPConv;
}
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
}
static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
Flags);
}
static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flags);
}
template <class NodeTy>
SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
bool IsLocal) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isPositionIndependent()) {
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
if (IsLocal)
// Use PC-relative addressing to access the symbol. This generates the
// pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
// %pcrel_lo(auipc)).
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
// Use PC-relative addressing to access the GOT for this symbol, then load
// the address from the GOT. This generates the pattern (PseudoLA sym),
// which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
}
switch (getTargetMachine().getCodeModel()) {
default:
report_fatal_error("Unsupported code model for lowering");
case CodeModel::Small: {
// Generate a sequence for accessing addresses within the first 2 GiB of
// address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
}
case CodeModel::Medium: {
// Generate a sequence for accessing addresses within any 2GiB range within
// the address space. This generates the pattern (PseudoLLA sym), which
// expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
}
}
}
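// As a sketch of what the sequences built above become once the pseudos and
// relocations are resolved (assembly shown for illustration only):
//
//   CodeModel::Small, non-PIC:
//     lui   a0, %hi(sym)
//     addi  a0, a0, %lo(sym)
//
//   CodeModel::Medium, or PIC access to a dso-local symbol (PseudoLLA):
//     .Lpcrel_hi0:
//       auipc a0, %pcrel_hi(sym)
//       addi  a0, a0, %pcrel_lo(.Lpcrel_hi0)
//
//   PIC access to a preemptible symbol (PseudoLA) uses the same auipc form
//   with %got_pcrel_hi and then loads the final address from the GOT entry.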
SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
const GlobalValue *GV = N->getGlobal();
bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
SDValue Addr = getAddr(N, DAG, IsLocal);
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
return getAddr(N, DAG);
}
SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
return getAddr(N, DAG);
}
SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG,
bool UseGOT) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = N->getGlobal();
MVT XLenVT = Subtarget.getXLenVT();
if (UseGOT) {
// Use PC-relative addressing to access the GOT for this TLS symbol, then
// load the address from the GOT and add the thread pointer. This generates
// the pattern (PseudoLA_TLS_IE sym), which expands to
// (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0);
// Add the thread pointer.
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
}
// Generate a sequence for accessing the address relative to the thread
// pointer, with the appropriate adjustment for the thread pointer offset.
// This generates the pattern
// (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
SDValue AddrHi =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
SDValue AddrAdd =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
SDValue AddrLo =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
SDValue MNAdd = SDValue(
DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd),
0);
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0);
}
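// A sketch of the two sequences above after expansion (tp is x4; assembly is
// illustrative only):
//
//   Initial-exec (UseGOT):
//     .Lpcrel_hi0:
//       auipc a0, %tls_ie_pcrel_hi(sym)
//       ld    a0, %pcrel_lo(.Lpcrel_hi0)(a0)   ; lw on RV32
//       add   a0, a0, tp
//
//   Local-exec:
//     lui   a0, %tprel_hi(sym)
//     add   a0, a0, tp, %tprel_add(sym)
//     addi  a0, a0, %tprel_lo(sym)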
SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
const GlobalValue *GV = N->getGlobal();
// Use a PC-relative addressing mode to access the global dynamic GOT address.
// This generates the pattern (PseudoLA_TLS_GD sym), which expands to
// (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0);
// Prepare argument list to generate call.
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Load;
Entry.Ty = CallTy;
Args.push_back(Entry);
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, CallTy,
DAG.getExternalSymbol("__tls_get_addr", Ty),
std::move(Args));
return LowerCallTo(CLI).first;
}
SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal());
SDValue Addr;
switch (Model) {
case TLSModel::LocalExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
break;
case TLSModel::InitialExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
break;
case TLSModel::LocalDynamic:
case TLSModel::GeneralDynamic:
Addr = getDynamicTLSAddr(N, DAG);
break;
}
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
}
SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue CondV = Op.getOperand(0);
SDValue TrueV = Op.getOperand(1);
SDValue FalseV = Op.getOperand(2);
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();
// If the result type is XLenVT and CondV is the output of a SETCC node
// which also operated on XLenVT inputs, then merge the SETCC node into the
// lowered RISCVISD::SELECT_CC to take advantage of the integer
// compare+branch instructions. i.e.:
// (select (setcc lhs, rhs, cc), truev, falsev)
// -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC &&
CondV.getOperand(0).getSimpleValueType() == XLenVT) {
SDValue LHS = CondV.getOperand(0);
SDValue RHS = CondV.getOperand(1);
auto CC = cast<CondCodeSDNode>(CondV.getOperand(2));
ISD::CondCode CCVal = CC->get();
normaliseSetCC(LHS, RHS, CCVal);
SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
// Otherwise:
// (select condv, truev, falsev)
// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
SDLoc DL(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
getPointerTy(MF.getDataLayout()));
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
Register FrameReg = RI.getFrameRegister(MF);
int XLenInBytes = Subtarget.getXLen() / 8;
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
while (Depth--) {
int Offset = -(XLenInBytes * 2);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
DAG.getIntPtrConstant(Offset, DL));
FrameAddr =
DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
}
return FrameAddr;
}
SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
MVT XLenVT = Subtarget.getXLenVT();
int XLenInBytes = Subtarget.getXLen() / 8;
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
int Off = -XLenInBytes;
SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(Off, DL, VT);
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo());
}
// Return the value of the return address register, marking it an implicit
// live-in.
Register Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
}
SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = Lo << Shamt
// Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
// else:
// Lo = 0
// Hi = Lo << (Shamt-XLEN)
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One);
SDValue ShiftRightLo =
DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt);
SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen);
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
}
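// The node sequence above mirrors this scalar reference, written here for
// XLEN = 32 with plain C++ integers (a sketch only; the real code builds the
// equivalent DAG nodes):
//
//   void shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Shamt,
//                   uint32_t &OutLo, uint32_t &OutHi) {
//     if (Shamt < 32) {
//       OutLo = Lo << Shamt;
//       // (Lo >> 1) >> (31 - Shamt) sidesteps an out-of-range shift by 32
//       // when Shamt == 0.
//       OutHi = (Hi << Shamt) | ((Lo >> 1) >> (31 - Shamt));
//     } else {
//       OutLo = 0;
//       OutHi = Lo << (Shamt - 32);
//     }
//   }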
SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
bool IsSRA) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// SRA expansion:
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
// Hi = Hi >>s Shamt
// else:
// Lo = Hi >>s (Shamt-XLEN);
// Hi = Hi >>s (XLEN-1)
//
// SRL expansion:
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
// Hi = Hi >>u Shamt
// else:
// Lo = Hi >>u (Shamt-XLEN);
// Hi = 0;
unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL;
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
SDValue ShiftLeftHi =
DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
SDValue HiFalse =
IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
}
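// The right-shift expansion is symmetrical to the left-shift case above; the
// only asymmetry is the out-of-range branch, sketched here for XLEN = 32:
//
//   if (Shamt >= 32) {
//     OutLo = IsSRA ? (uint32_t)((int32_t)Hi >> (Shamt - 32))
//                   : Hi >> (Shamt - 32);
//     OutHi = IsSRA ? (uint32_t)((int32_t)Hi >> 31) : 0; // fill with sign or 0
//   }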
SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc DL(Op);
switch (IntNo) {
default:
return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getRegister(RISCV::X4, PtrVT);
}
}
}
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
switch (Opcode) {
default:
llvm_unreachable("Unexpected opcode");
case ISD::SHL:
return RISCVISD::SLLW;
case ISD::SRA:
return RISCVISD::SRAW;
case ISD::SRL:
return RISCVISD::SRLW;
case ISD::SDIV:
return RISCVISD::DIVW;
case ISD::UDIV:
return RISCVISD::DIVUW;
case ISD::UREM:
return RISCVISD::REMUW;
}
}
// Converts the given 32-bit operation to a target-specific SelectionDAG node.
// Because i32 isn't a legal type for RV64, these operations would otherwise
// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
// later on because the fact that the operation was originally of type i32 is
// lost.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
// ReplaceNodeResults requires we maintain the same type for the return value.
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
}
// Converts the given 32-bit operation to an i64 operation with sign-extension
// semantics, so that the number of sign-extension instructions can be reduced.
static SDValue customLegalizeToWOpWithSExt(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue NewWOp = DAG.getNode(N->getOpcode(), DL, MVT::i64, NewOp0, NewOp1);
SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp,
DAG.getValueType(MVT::i32));
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
}
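// The any-extends above are safe because the *W instructions only read the
// low 32 bits of their sources (and the low 5 bits of a shift amount) and
// always produce a sign-extended 32-bit result, so whatever lands in the
// upper operand bits is ignored. A sketch of SLLW with plain integers
// (hypothetical helper, for illustration only):
//
//   int64_t emulateSLLW(int64_t RS1, int64_t RS2) {
//     uint32_t Lo = static_cast<uint32_t>(RS1);        // upper bits ignored
//     unsigned Shamt = static_cast<unsigned>(RS2) & 31;
//     return static_cast<int32_t>(Lo << Shamt);        // sign-extended result
//   }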
void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDLoc DL(N);
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom type legalize this operation!");
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsStrict = N->isStrictFPOpcode();
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
SDValue Op0 = IsStrict ? N->getOperand(1) : N->getOperand(0);
RTLIB::Libcall LC;
if (N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op0.getValueType(), N->getValueType(0));
else
LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Result;
std::tie(Result, Chain) =
makeLibCall(DAG, LC, N->getValueType(0), Op0, CallOptions, DL, Chain);
Results.push_back(Result);
if (IsStrict)
Results.push_back(Chain);
break;
}
case ISD::READCYCLECOUNTER: {
assert(!Subtarget.is64Bit() &&
"READCYCLECOUNTER only has custom type legalization on riscv32");
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RCW =
DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
Results.push_back(
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, RCW, RCW.getValue(1)));
Results.push_back(RCW.getValue(2));
break;
}
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
return;
Results.push_back(customLegalizeToWOpWithSExt(N, DAG));
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtM() && "Unexpected custom legalisation");
if (N->getOperand(0).getOpcode() == ISD::Constant ||
N->getOperand(1).getOpcode() == ISD::Constant)
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::BITCAST: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtF() && "Unexpected custom legalisation");
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
if (Op0.getValueType() != MVT::f32)
return;
SDValue FPConv =
DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
break;
}
}
}
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
break;
case RISCVISD::SplitF64: {
SDValue Op0 = N->getOperand(0);
// If the input to SplitF64 is just BuildPairF64 then the operation is
// redundant. Instead, use BuildPairF64's operands directly.
if (Op0->getOpcode() == RISCVISD::BuildPairF64)
return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
SDLoc DL(N);
// It's cheaper to materialise two 32-bit integers than to load a double
// from the constant pool and transfer it to integer registers through the
// stack.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
APInt V = C->getValueAPF().bitcastToAPInt();
SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
return DCI.CombineTo(N, Lo, Hi);
}
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
!Op0.getNode()->hasOneUse())
break;
SDValue NewSplitF64 =
DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
Op0.getOperand(0));
SDValue Lo = NewSplitF64.getValue(0);
SDValue Hi = NewSplitF64.getValue(1);
APInt SignBit = APInt::getSignMask(32);
if (Op0.getOpcode() == ISD::FNEG) {
SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
DAG.getConstant(SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
}
assert(Op0.getOpcode() == ISD::FABS);
SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi,
DAG.getConstant(~SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
}
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
(SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
return SDValue();
break;
}
case RISCVISD::FMV_X_ANYEXTW_RV64: {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
// If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
// conversion is unnecessary and can be replaced with an ANY_EXTEND
// of the FMV_W_X_RV64 operand.
if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
SDValue AExtOp =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
return DCI.CombineTo(N, AExtOp);
}
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
!Op0.getNode()->hasOneUse())
break;
SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
Op0.getOperand(0));
APInt SignBit = APInt::getSignMask(32).sext(64);
if (Op0.getOpcode() == ISD::FNEG) {
return DCI.CombineTo(N,
DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
DAG.getConstant(SignBit, DL, MVT::i64)));
}
assert(Op0.getOpcode() == ISD::FABS);
return DCI.CombineTo(N,
DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
DAG.getConstant(~SignBit, DL, MVT::i64)));
}
}
return SDValue();
}
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
//
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
SDValue N0 = N->getOperand(0);
EVT Ty = N0.getValueType();
if (Ty.isScalarInteger() &&
(N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) {
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (C1 && C2) {
APInt C1Int = C1->getAPIntValue();
APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
// We can materialise `c1 << c2` into an add immediate, so it's "free",
// and the combine should happen, to potentially allow further combines
// later.
if (ShiftedC1Int.getMinSignedBits() <= 64 &&
isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
return true;
// We can materialise `c1` in an add immediate, so it's "free", and the
// combine should be prevented.
if (C1Int.getMinSignedBits() <= 64 &&
isLegalAddImmediate(C1Int.getSExtValue()))
return false;
// Neither constant will fit into an immediate, so find materialisation
// costs.
int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
Subtarget.is64Bit());
int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
// combine should be prevented.
if (C1Cost < ShiftedC1Cost)
return false;
}
}
return true;
}
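// A concrete instance of the reasoning above: for (shl (add x, 7), 2) the
// shifted constant 7 << 2 = 28 still fits a 12-bit add immediate, so the
// combine to (add (shl x, 2), 28) is allowed. For (shl (add x, 1000), 3) the
// shifted constant 8000 no longer fits (the limit is 2047) while 1000 does,
// so the combine is rejected and the cheaper original form is kept.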
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW:
case RISCVISD::DIVW:
case RISCVISD::DIVUW:
case RISCVISD::REMUW:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
return 33;
}
return 1;
}
static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");
// To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
// Should the count have wrapped while it was being read, we need to try
// again.
// ...
// read:
// rdcycleh x3 # load high word of cycle
// rdcycle x2 # load low word of cycle
// rdcycleh x4 # load high word of cycle
// bne x3, x4, read # check if high word reads match, otherwise try again
// ...
MachineFunction &MF = *BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, LoopMBB);
MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, DoneMBB);
// Transfer the remainder of BB and its successor edges to DoneMBB.
DoneMBB->splice(DoneMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
DoneMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(LoopMBB);
MachineRegisterInfo &RegInfo = MF.getRegInfo();
Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
.addReg(HiReg)
.addReg(ReadAgainReg)
.addMBB(LoopMBB);
LoopMBB->addSuccessor(LoopMBB);
LoopMBB->addSuccessor(DoneMBB);
MI.eraseFromParent();
return DoneMBB;
}
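// The loop emitted above is the usual wrap-safe way of reading a 64-bit
// counter through two 32-bit CSR reads. In C-like pseudocode (a sketch only;
// read_csr stands in for the CSRRS instructions built above):
//
//   uint32_t Hi, Lo, HiAgain;
//   do {
//     Hi      = read_csr(CYCLEH);
//     Lo      = read_csr(CYCLE);
//     HiAgain = read_csr(CYCLEH);
//   } while (Hi != HiAgain);          // the low word wrapped between reads
//   uint64_t Cycles = ((uint64_t)Hi << 32) | Lo;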
static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
Register SrcReg = MI.getOperand(2).getReg();
const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
RI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOLoad, 8, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(MMO);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
"Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
Register DstReg = MI.getOperand(0).getReg();
Register LoReg = MI.getOperand(1).getReg();
Register HiReg = MI.getOperand(2).getReg();
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOStore, 8, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(MMO);
TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static bool isSelectPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return true;
}
}
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
// To "insert" Select_* instructions, we actually have to insert the triangle
// control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
// select between, and the condcode to use to select the appropriate branch.
//
// We produce the following control flow:
// HeadMBB
// | \
// | IfFalseMBB
// | /
// TailMBB
//
// When we find a sequence of selects we attempt to optimize their emission
// by sharing the control flow. Currently we only handle cases where we have
// multiple selects with the exact same condition (same LHS, RHS and CC).
// The selects may be interleaved with other instructions if the other
// instructions meet some requirements we deem safe:
// - They are debug instructions. Otherwise,
// - They do not have side-effects, do not access memory and their inputs do
// not depend on the results of the select pseudo-instructions.
// The TrueV/FalseV operands of the selects cannot depend on the result of
// previous selects in the sequence.
// These conditions could be further relaxed. See the X86 target for a
// related approach and more information.
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
SmallSet<Register, 4> SelectDests;
SelectDests.insert(MI.getOperand(0).getReg());
MachineInstr *LastSelectPseudo = &MI;
for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
SequenceMBBI != E; ++SequenceMBBI) {
if (SequenceMBBI->isDebugInstr())
continue;
else if (isSelectPseudo(*SequenceMBBI)) {
if (SequenceMBBI->getOperand(1).getReg() != LHS ||
SequenceMBBI->getOperand(2).getReg() != RHS ||
SequenceMBBI->getOperand(3).getImm() != CC ||
SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
break;
LastSelectPseudo = &*SequenceMBBI;
SequenceMBBI->collectDebugValues(SelectDebugValues);
SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
} else {
if (SequenceMBBI->hasUnmodeledSideEffects() ||
SequenceMBBI->mayLoadOrStore())
break;
if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
}))
break;
}
}
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(I, IfFalseMBB);
F->insert(I, TailMBB);
// Transfer debug instructions associated with the selects to TailMBB.
for (MachineInstr *DebugInstr : SelectDebugValues) {
TailMBB->push_back(DebugInstr->removeFromParent());
}
// Move all instructions after the sequence to TailMBB.
TailMBB->splice(TailMBB->end(), HeadMBB,
std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi nodes for the selects.
TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
// Set the successors for HeadMBB.
HeadMBB->addSuccessor(IfFalseMBB);
HeadMBB->addSuccessor(TailMBB);
// Insert appropriate branch.
unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
BuildMI(HeadMBB, DL, TII.get(Opcode))
.addReg(LHS)
.addReg(RHS)
.addMBB(TailMBB);
// IfFalseMBB just falls through to TailMBB.
IfFalseMBB->addSuccessor(TailMBB);
// Create PHIs for all of the select pseudo-instructions.
auto SelectMBBI = MI.getIterator();
auto SelectEnd = std::next(LastSelectPseudo->getIterator());
auto InsertionPoint = TailMBB->begin();
while (SelectMBBI != SelectEnd) {
auto Next = std::next(SelectMBBI);
if (isSelectPseudo(*SelectMBBI)) {
// %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
.addReg(SelectMBBI->getOperand(4).getReg())
.addMBB(HeadMBB)
.addReg(SelectMBBI->getOperand(5).getReg())
.addMBB(IfFalseMBB);
SelectMBBI->eraseFromParent();
}
SelectMBBI = Next;
}
F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
return TailMBB;
}
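// A sketch of the rewrite for a single Select_GPR_Using_CC_GPR (register and
// block names are illustrative):
//
//   before:  %dst = Select_GPR_Using_CC_GPR %lhs, %rhs, setlt, %tval, %fval
//
//   after:   HeadMBB:    BLT %lhs, %rhs, TailMBB
//            IfFalseMBB: (falls through)
//            TailMBB:    %dst = PHI [ %tval, HeadMBB ], [ %fval, IfFalseMBB ]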
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
case RISCV::ReadCycleWide:
assert(!Subtarget.is64Bit() &&
"ReadCycleWrite is only to be used on riscv32");
return emitReadCycleWidePseudo(MI, BB);
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return emitSelectPseudo(MI, BB);
case RISCV::BuildPairF64Pseudo:
return emitBuildPairF64Pseudo(MI, BB);
case RISCV::SplitF64Pseudo:
return emitSplitF64Pseudo(MI, BB);
}
}
// Calling Convention Implementation.
// The expectations for frontend ABI lowering vary from target to target.
// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
// details, but this is a longer term goal. For now, we simply try to keep the
// role of the frontend as simple and well-defined as possible. The rules can
// be summarised as:
// * Never split up large scalar arguments. We handle them here.
// * If a hardfloat calling convention is being used, and the struct may be
// passed in a pair of registers (fp+fp, int+fp), and both registers are
// available, then pass as two separate arguments. If either the GPRs or FPRs
// are exhausted, then pass according to the rule below.
// * If a struct could never be passed in registers or directly in a stack
// slot (as it is larger than 2*XLEN and the floating point rules don't
// apply), then pass it using a pointer with the byval attribute.
// * If a struct is less than 2*XLEN, then coerce to either a two-element
// word-sized array or a 2*XLEN scalar (depending on alignment).
// * The frontend can determine whether a struct is returned by reference or
// not based on its size and fields. If it will be returned by reference, the
// frontend must modify the prototype so a pointer with the sret annotation is
// passed as the first argument. This is not necessary for large scalar
// returns.
// * Struct return values and varargs should be coerced to structs containing
// register-size fields in the same situations they would be for fixed
// arguments.
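// As a sketch of the hard-float rule above: under an ilp32d/lp64d ABI, a
// value of
//   struct S { double D; int I; };
// is passed as two separate arguments, the double in an FP argument register
// and the int in a GPR, provided both register classes still have registers
// available. Once either class is exhausted, the struct is instead passed
// according to the ordinary rules that follow (coercion or byval, depending
// on its size).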
static const MCPhysReg ArgGPRs[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13,
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
static const MCPhysReg ArgFPR32s[] = {
RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F,
RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F
};
static const MCPhysReg ArgFPR64s[] = {
RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D,
RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D
};
// Pass a 2*XLEN argument that has been split into two XLEN values through
// registers or the stack as necessary.
static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
MVT ValVT2, MVT LocVT2,
ISD::ArgFlagsTy ArgFlags2) {
unsigned XLenInBytes = XLen / 8;
if (Register Reg = State.AllocateReg(ArgGPRs)) {
// At least one half can be passed via register.
State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
VA1.getLocVT(), CCValAssign::Full));
} else {
// Both halves must be passed on the stack, with proper alignment.
Align StackAlign =
std::max(Align(XLenInBytes), ArgFlags1.getNonZeroOrigAlign());
State.addLoc(
CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
State.AllocateStack(XLenInBytes, StackAlign),
VA1.getLocVT(), CCValAssign::Full));
State.addLoc(CCValAssign::getMem(
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
LocVT2, CCValAssign::Full));
return false;
}
if (Register Reg = State.AllocateReg(ArgGPRs)) {
// The second half can also be passed via register.
State.addLoc(
CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
} else {
// The second half is passed via the stack, without additional alignment.
State.addLoc(CCValAssign::getMem(
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
LocVT2, CCValAssign::Full));
}
return false;
}
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
// Any return value split into more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
// UseGPRForF32 is true if targeting one of the soft-float ABIs, if passing a
// variadic argument, or if no F32 argument registers are available.
bool UseGPRForF32 = true;
// UseGPRForF64 is true if targeting soft-float ABIs or an FLEN=32 ABI, if
// passing a variadic argument, or if no F64 argument registers are available.
bool UseGPRForF64 = true;
switch (ABI) {
default:
llvm_unreachable("Unexpected ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_LP64:
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
UseGPRForF32 = !IsFixed;
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
UseGPRForF32 = !IsFixed;
UseGPRForF64 = !IsFixed;
break;
}
if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
UseGPRForF32 = true;
if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
UseGPRForF64 = true;
// From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
// variables rather than directly checking against the target ABI.
if (UseGPRForF32 && ValVT == MVT::f32) {
LocVT = XLenVT;
LocInfo = CCValAssign::BCvt;
} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
LocVT = MVT::i64;
LocInfo = CCValAssign::BCvt;
}
// If this is a variadic argument, the RISC-V calling convention requires
// that it is assigned an 'even' or 'aligned' register if it has 8-byte
// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
// be used regardless of whether the original argument was split during
// legalisation or not. The argument will not be passed by registers if the
// original type is larger than 2*XLEN, so the register alignment rule does
// not apply.
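// For example, a variadic double on RV32 has 8-byte size and alignment; if
// the next free GPR is a1 (an odd register index), a1 is skipped so the value
// lands in the aligned a2/a3 pair.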
unsigned TwoXLenInBytes = (2 * XLen) / 8;
if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
State.AllocateReg(ArgGPRs);
}
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
State.getPendingArgFlags();
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
// Handle passing f64 on RV32D with a soft float ABI or when floating point
// registers are exhausted.
if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
"Can't lower f64 if it is split");
// Depending on available argument GPRs, f64 may be passed in a pair of
// GPRs, split between a GPR and the stack, or passed completely on the
// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
// cases.
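// For example, the f64 lands in a0/a1 if two GPRs remain, in a7 plus a 4-byte
// stack slot if only one remains, or in a single 8-byte stack slot if none
// remain.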
Register Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
unsigned StackOffset = State.AllocateStack(8, Align(8));
State.addLoc(
CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
if (!State.AllocateReg(ArgGPRs))
State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
PendingLocs.push_back(
CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
PendingArgFlags.push_back(ArgFlags);
if (!ArgFlags.isSplitEnd()) {
return false;
}
}
// If the split argument only had two elements, it should be passed directly
// in registers or on the stack.
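// For example, an i64 on RV32 (or an i128 on RV64) splits into exactly two
// XLEN-sized pieces and is handled by CC_RISCVAssign2XLen above.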
if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
// Apply the normal calling convention rules to the first half of the
// split argument.
CCValAssign VA = PendingLocs[0];
ISD::ArgFlagsTy AF = PendingArgFlags[0];
PendingLocs.clear();
PendingArgFlags.clear();
return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT,
ArgFlags);
}
// Allocate to a register if possible, or else a stack slot.
Register Reg;
if (ValVT == MVT::f32 && !UseGPRForF32)
Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
else
Reg = State.AllocateReg(ArgGPRs);
unsigned StackOffset =
Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
if (!PendingLocs.empty()) {
assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()");
assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()");
for (auto &It : PendingLocs) {
if (Reg)
It.convertToReg(Reg);
else
It.convertToMem(StackOffset);
State.addLoc(It);
}
PendingLocs.clear();
PendingArgFlags.clear();
return false;
}
assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
"Expected an XLenVT at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// When an f32 or f64 is passed on the stack, no bit-conversion is needed.
if (ValVT == MVT::f32 || ValVT == MVT::f64) {
LocVT = ValVT;
LocInfo = CCValAssign::Full;
}
State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
FunctionType *FType = MF.getFunction().getFunctionType();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
Type *ArgTy = nullptr;
if (IsRet)
ArgTy = FType->getReturnType();
else if (Ins[i].isOrigArg())
ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
}
}
}
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
}
}
}
// Convert Val to a ValVT. Should not be called for CCValAssign::Indirect
// values.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
break;
}
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
return Val;
}
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
SDValue Val;
const TargetRegisterClass *RC;
switch (LocVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unexpected register type");
case MVT::i32:
case MVT::i64:
RC = &RISCV::GPRRegClass;
break;
case MVT::f32:
RC = &RISCV::FPR32RegClass;
break;
case MVT::f64:
RC = &RISCV::FPR64RegClass;
break;
}
Register VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
if (VA.getLocInfo() == CCValAssign::Indirect)
return Val;
return convertLocVTToValVT(DAG, Val, VA, DL);
}
static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
EVT LocVT = VA.getLocVT();
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
break;
}
Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
break;
}
return Val;
}
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val;
ISD::LoadExtType ExtType;
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
case CCValAssign::BCvt:
ExtType = ISD::NON_EXTLOAD;
break;
}
Val = DAG.getExtLoad(
ExtType, DL, LocVT, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT);
return Val;
}
static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
"Unexpected VA");
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
if (VA.isMemLoc()) {
// f64 is passed on the stack.
int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(MVT::f64, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
assert(VA.isRegLoc() && "Expected register VA assignment");
Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
SDValue Hi;
if (VA.getLocReg() == RISCV::X17) {
// Second half of f64 is passed on the stack.
int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
} else {
// Second half of f64 is passed in another GPR.
Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
}
return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
}
// FastCC shows less than a 1% performance improvement on some particular
// benchmarks, but in theory it may benefit some cases.
static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
if (LocVT == MVT::i32 || LocVT == MVT::i64) {
// X5 and X6 might be used for save-restore libcall.
static const MCPhysReg GPRList[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28,
RISCV::X29, RISCV::X30, RISCV::X31};
if (unsigned Reg = State.AllocateReg(GPRList)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::f32) {
static const MCPhysReg FPR32List[] = {
RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F,
RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F,
RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F,
RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F};
if (unsigned Reg = State.AllocateReg(FPR32List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::f64) {
static const MCPhysReg FPR64List[] = {
RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D,
RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D,
RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D,
RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D};
if (unsigned Reg = State.AllocateReg(FPR64List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
unsigned Offset4 = State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo));
return false;
}
if (LocVT == MVT::i64 || LocVT == MVT::f64) {
unsigned Offset5 = State.AllocateStack(8, Align(8));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo));
return false;
}
return true; // CC didn't match.
}
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
break;
}
MachineFunction &MF = DAG.getMachineFunction();
const Function &Func = MF.getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.arg_empty())
report_fatal_error(
"Functions with the interrupt attribute cannot have arguments!");
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
report_fatal_error(
"Function interrupt attribute argument not supported!");
}
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
unsigned XLenInBytes = Subtarget.getXLen() / 8;
// Used with varargs to accumulate store chains.
std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (CallConv == CallingConv::Fast)
CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
else
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
// Passing f64 on RV32D with a soft float ABI must be handled as a special
// case.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
if (VA.getLocInfo() == CCValAssign::Indirect) {
// If the original argument was split and passed by reference (e.g. i128
// on RV32), we need to load all parts of it here (using the same
// address).
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
unsigned ArgIndex = Ins[i].OrigArgIndex;
assert(Ins[i].PartOffset == 0);
while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[i + 1];
unsigned PartOffset = Ins[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
DAG.getIntPtrConstant(PartOffset, DL));
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
MachinePointerInfo()));
++i;
}
continue;
}
InVals.push_back(ArgValue);
}
if (IsVarArg) {
ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
const TargetRegisterClass *RC = &RISCV::GPRRegClass;
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Offset of the first variable argument from stack pointer, and size of
// the vararg save area. For now, the varargs save area is either zero or
// large enough to hold a0-a7.
int VaArgOffset, VarArgsSaveSize;
// If all registers are allocated, then all varargs must be passed on the
// stack and we don't need to save any argregs.
if (ArgRegs.size() == Idx) {
VaArgOffset = CCInfo.getNextStackOffset();
VarArgsSaveSize = 0;
} else {
VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
VaArgOffset = -VarArgsSaveSize;
}
// Record the frame index of the first variable argument,
// which is needed to implement VASTART.
int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
RVFI->setVarArgsFrameIndex(FI);
// If saving an odd number of registers, create an extra stack slot to
// ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
// offsets to even-numbered registers remain 2*XLEN-aligned.
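// For example, with the ILP32 ABI and one named argument, a1-a7 are saved (28
// bytes) and Idx is 1 (odd), so an extra 4-byte slot brings the save area to
// 32 bytes and keeps it 8-byte aligned.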
if (Idx % 2) {
MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true);
VarArgsSaveSize += XLenInBytes;
}
// Copy the integer registers that may have been used for passing varargs
// to the vararg save area.
for (unsigned I = Idx; I < ArgRegs.size();
++I, VaArgOffset += XLenInBytes) {
const Register Reg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
MachinePointerInfo::getFixedStack(MF, FI));
cast<StoreSDNode>(Store.getNode())
->getMemOperand()
->setValue((Value *)nullptr);
OutChains.push_back(Store);
}
RVFI->setVarArgsSaveSize(VarArgsSaveSize);
}
// All stores are grouped in one node to allow the matching between
// the size of Ins and InVals. This only happens for vararg functions.
if (!OutChains.empty()) {
OutChains.push_back(Chain);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
}
return Chain;
}
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
bool RISCVTargetLowering::isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const {
auto &Callee = CLI.Callee;
auto CalleeCC = CLI.CallConv;
auto &Outs = CLI.Outs;
auto &Caller = MF.getFunction();
auto CallerCC = Caller.getCallingConv();
// Exception-handling functions need a special set of instructions to
// indicate a return to the hardware. Tail-calling another function would
// probably break this.
// TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
// should be expanded as new function attributes are introduced.
if (Caller.hasFnAttribute("interrupt"))
return false;
// Do not tail call opt if the stack is used to pass parameters.
if (CCInfo.getNextStackOffset() != 0)
return false;
// Do not tail call opt if any parameters need to be passed indirectly.
// Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
// passed indirectly. The address of the value is passed in a register or, if
// none is available, on the stack. Passing indirectly often requires
// allocating stack space to hold the value; in that case the
// CCInfo.getNextStackOffset() != 0 check is not enough, and we also need to
// check whether any CCValAssign in ArgLocs uses CCValAssign::Indirect.
for (auto &VA : ArgLocs)
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
// Do not tail call opt if either caller or callee uses struct return
// semantics.
auto IsCallerStructRet = Caller.hasStructRetAttr();
auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
if (IsCallerStructRet || IsCalleeStructRet)
return false;
// Externally-defined functions with weak linkage should not be
// tail-called. The behaviour of branch instructions in this situation (as
// used for tail calls) is implementation-defined, so we cannot rely on the
// linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
if (GV->hasExternalWeakLinkage())
return false;
}
// The callee has to preserve all registers the caller needs to preserve.
const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (CalleeCC != CallerCC) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible
// but less efficient and uglier in LowerCall.
for (auto &Arg : Outs)
if (Arg.Flags.isByVal())
return false;
return true;
}
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (CallConv == CallingConv::Fast)
ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
else
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)
IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
if (IsTailCall)
++NumTailCalls;
else if (CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Create local copies for byval args
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (!Flags.isByVal())
continue;
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
Align Alignment = Flags.getNonZeroByValAlign();
int FI =
MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
/*IsVolatile=*/false,
/*AlwaysInline=*/false, IsTailCall,
MachinePointerInfo(), MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Handle passing f64 on RV32D with a soft float ABI as a special case.
bool IsF64OnRV32DSoftABI =
VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
SDValue SplitF64 = DAG.getNode(
RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
Register RegLo = VA.getLocReg();
RegsToPass.push_back(std::make_pair(RegLo, Lo));
if (RegLo == RISCV::X17) {
// Second half of f64 is passed on the stack.
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
} else {
// Second half of f64 is passed in another GPR.
assert(RegLo < RISCV::X31 && "Invalid register pair");
Register RegHigh = RegLo + 1;
RegsToPass.push_back(std::make_pair(RegHigh, Hi));
}
continue;
}
// IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
// as any other MemLoc.
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
// If the original argument was split (e.g. i128), we need
// to store all parts of it here (and pass just one address).
unsigned ArgIndex = Outs[i].OrigArgIndex;
assert(Outs[i].PartOffset == 0);
while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
SDValue PartValue = OutVals[i + 1];
unsigned PartOffset = Outs[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
DAG.getIntPtrConstant(PartOffset, DL));
MemOpChains.push_back(
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
++i;
}
ArgValue = SpillSlot;
} else {
ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
}
// Use local copy if it is a byval arg.
if (Flags.isByVal())
ArgValue = ByValArgs[j++];
if (VA.isRegLoc()) {
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
assert(!IsTailCall && "Tail call not allowed if stack is used "
"for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
SDValue Address =
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
}
}
// Join the stores, which are independent of one another.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue Glue;
// Build a sequence of copy-to-reg nodes, chained and glued together.
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
Glue = Chain.getValue(1);
}
// Validate that none of the argument registers have been marked as
// reserved; if any have, report an error. Do the same for the return address
// register if this is not a tail call.
validateCCReservedRegs(RegsToPass, MF);
if (!IsTailCall &&
MF.getSubtarget<RISCVSubtarget>().isRegisterReservedByUser(RISCV::X1))
MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
MF.getFunction(),
"Return address register required, but has been reserved."});
// If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
// split it and then direct call can be matched by PseudoCALL.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = S->getGlobal();
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
nullptr))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
}
// The first call operand is the chain and the second is the target address.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add argument registers to the end of the list so that they are
// known live into the call.
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
if (!IsTailCall) {
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
}
// Glue the call to the argument copies, if any.
if (Glue.getNode())
Ops.push_back(Glue);
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
}
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, DL, PtrVT, true),
DAG.getConstant(0, DL, PtrVT, true),
Glue, DL);
Glue = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
// Copy the value out
SDValue RetValue =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
// Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
SDValue RetValue2 =
DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
Chain = RetValue2.getValue(1);
Glue = RetValue2.getValue(2);
RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
RetValue2);
}
RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
InVals.push_back(RetValue);
}
return Chain;
}
bool RISCVTargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
}
return true;
}
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
// Stores the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
// Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
nullptr);
SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) {
SDValue Val = OutVals[i];
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
// Handle returning f64 on RV32D with a soft float ABI.
assert(VA.isRegLoc() && "Expected return via registers");
SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
DAG.getVTList(MVT::i32, MVT::i32), Val);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
Register RegLo = VA.getLocReg();
assert(RegLo < RISCV::X31 && "Invalid register pair");
Register RegHi = RegLo + 1;
if (STI.isRegisterReservedByUser(RegLo) ||
STI.isRegisterReservedByUser(RegHi))
MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
MF.getFunction(),
"Return value register required, but has been reserved."});
Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
} else {
// Handle a 'normal' return.
Val = convertValVTToLocVT(DAG, Val, VA, DL);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
if (STI.isRegisterReservedByUser(VA.getLocReg()))
MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
MF.getFunction(),
"Return value register required, but has been reserved."});
// Guarantee that all emitted copies are stuck together.
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
}
RetOps[0] = Chain; // Update chain.
// Add the glue node if we have it.
if (Glue.getNode()) {
RetOps.push_back(Glue);
}
// Interrupt service routines use different return instructions.
const Function &Func = DAG.getMachineFunction().getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.getReturnType()->isVoidTy())
report_fatal_error(
"Functions with the interrupt attribute must have void return type!");
MachineFunction &MF = DAG.getMachineFunction();
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
unsigned RetOpc;
if (Kind == "user")
RetOpc = RISCVISD::URET_FLAG;
else if (Kind == "supervisor")
RetOpc = RISCVISD::SRET_FLAG;
else
RetOpc = RISCVISD::MRET_FLAG;
return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
}
void RISCVTargetLowering::validateCCReservedRegs(
const SmallVectorImpl<std::pair<llvm::Register, llvm::SDValue>> &Regs,
MachineFunction &MF) const {
const Function &F = MF.getFunction();
const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
if (std::any_of(std::begin(Regs), std::end(Regs), [&STI](auto Reg) {
return STI.isRegisterReservedByUser(Reg.first);
}))
F.getContext().diagnose(DiagnosticInfoUnsupported{
F, "Argument register required, but has been reserved."});
}
bool RISCVTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((RISCVISD::NodeType)Opcode) {
case RISCVISD::FIRST_NUMBER:
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
case RISCVISD::URET_FLAG:
return "RISCVISD::URET_FLAG";
case RISCVISD::SRET_FLAG:
return "RISCVISD::SRET_FLAG";
case RISCVISD::MRET_FLAG:
return "RISCVISD::MRET_FLAG";
case RISCVISD::CALL:
return "RISCVISD::CALL";
case RISCVISD::SELECT_CC:
return "RISCVISD::SELECT_CC";
case RISCVISD::BuildPairF64:
return "RISCVISD::BuildPairF64";
case RISCVISD::SplitF64:
return "RISCVISD::SplitF64";
case RISCVISD::TAIL:
return "RISCVISD::TAIL";
case RISCVISD::SLLW:
return "RISCVISD::SLLW";
case RISCVISD::SRAW:
return "RISCVISD::SRAW";
case RISCVISD::SRLW:
return "RISCVISD::SRLW";
case RISCVISD::DIVW:
return "RISCVISD::DIVW";
case RISCVISD::DIVUW:
return "RISCVISD::DIVUW";
case RISCVISD::REMUW:
return "RISCVISD::REMUW";
case RISCVISD::FMV_W_X_RV64:
return "RISCVISD::FMV_W_X_RV64";
case RISCVISD::FMV_X_ANYEXTW_RV64:
return "RISCVISD::FMV_X_ANYEXTW_RV64";
case RISCVISD::READ_CYCLE_WIDE:
return "RISCVISD::READ_CYCLE_WIDE";
}
return nullptr;
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
RISCVTargetLowering::ConstraintType
RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
case 'f':
return C_RegisterClass;
case 'I':
case 'J':
case 'K':
return C_Immediate;
case 'A':
return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
}
std::pair<unsigned, const TargetRegisterClass *>
RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to a
// RISCV register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
case 'f':
if (Subtarget.hasStdExtF() && VT == MVT::f32)
return std::make_pair(0U, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD() && VT == MVT::f64)
return std::make_pair(0U, &RISCV::FPR64RegClass);
break;
default:
break;
}
}
// Clang will correctly decode the usage of register name aliases into their
// official names. However, other frontends like `rustc` do not. This allows
// users of these frontends to use the ABI names for registers in LLVM-style
// register constraints.
Register XRegFromAlias = StringSwitch<Register>(Constraint.lower())
.Case("{zero}", RISCV::X0)
.Case("{ra}", RISCV::X1)
.Case("{sp}", RISCV::X2)
.Case("{gp}", RISCV::X3)
.Case("{tp}", RISCV::X4)
.Case("{t0}", RISCV::X5)
.Case("{t1}", RISCV::X6)
.Case("{t2}", RISCV::X7)
.Cases("{s0}", "{fp}", RISCV::X8)
.Case("{s1}", RISCV::X9)
.Case("{a0}", RISCV::X10)
.Case("{a1}", RISCV::X11)
.Case("{a2}", RISCV::X12)
.Case("{a3}", RISCV::X13)
.Case("{a4}", RISCV::X14)
.Case("{a5}", RISCV::X15)
.Case("{a6}", RISCV::X16)
.Case("{a7}", RISCV::X17)
.Case("{s2}", RISCV::X18)
.Case("{s3}", RISCV::X19)
.Case("{s4}", RISCV::X20)
.Case("{s5}", RISCV::X21)
.Case("{s6}", RISCV::X22)
.Case("{s7}", RISCV::X23)
.Case("{s8}", RISCV::X24)
.Case("{s9}", RISCV::X25)
.Case("{s10}", RISCV::X26)
.Case("{s11}", RISCV::X27)
.Case("{t3}", RISCV::X28)
.Case("{t4}", RISCV::X29)
.Case("{t5}", RISCV::X30)
.Case("{t6}", RISCV::X31)
.Default(RISCV::NoRegister);
if (XRegFromAlias != RISCV::NoRegister)
return std::make_pair(XRegFromAlias, &RISCV::GPRRegClass);
// Since TargetLowering::getRegForInlineAsmConstraint uses the name of the
// TableGen record rather than the AsmName to choose registers for InlineAsm
// constraints, and because we want to match those names to the widest floating
// point register type available, manually select floating point registers here.
//
// The second case is the ABI name of the register, so that frontends can also
// use the ABI names in register constraint lists.
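// For example, with the D extension enabled the constraint "{fa0}" (or
// "{f10}") selects F10_D; with only the F extension it selects F10_F.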
if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) {
std::pair<Register, Register> FReg =
StringSwitch<std::pair<Register, Register>>(Constraint.lower())
.Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D})
.Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D})
.Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D})
.Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D})
.Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D})
.Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D})
.Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D})
.Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D})
.Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D})
.Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D})
.Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D})
.Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D})
.Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D})
.Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D})
.Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D})
.Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D})
.Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D})
.Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D})
.Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D})
.Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D})
.Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D})
.Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D})
.Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D})
.Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D})
.Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D})
.Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D})
.Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D})
.Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D})
.Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D})
.Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D})
.Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D})
.Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D})
.Default({RISCV::NoRegister, RISCV::NoRegister});
if (FReg.first != RISCV::NoRegister)
return Subtarget.hasStdExtD()
? std::make_pair(FReg.second, &RISCV::FPR64RegClass)
: std::make_pair(FReg.first, &RISCV::FPR32RegClass);
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
unsigned
RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
// Currently only support length 1 constraints.
if (ConstraintCode.size() == 1) {
switch (ConstraintCode[0]) {
case 'A':
return InlineAsm::Constraint_A;
default:
break;
}
}
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
void RISCVTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
// Currently only support length 1 constraints.
if (Constraint.length() == 1) {
switch (Constraint[0]) {
case 'I':
// Validate & create a 12-bit signed immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
uint64_t CVal = C->getSExtValue();
if (isInt<12>(CVal))
Ops.push_back(
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
}
return;
case 'J':
// Validate & create an integer zero operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (C->getZExtValue() == 0)
Ops.push_back(
DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
return;
case 'K':
// Validate & create a 5-bit unsigned immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
uint64_t CVal = C->getZExtValue();
if (isUInt<5>(CVal))
Ops.push_back(
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
}
return;
default:
break;
}
}
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Release);
return nullptr;
}
Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Acquire);
return nullptr;
}
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
// point operations can't be used in an lr/sc sequence without breaking the
// forward-progress guarantee.
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
}
static Intrinsic::ID
getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
if (XLen == 32) {
switch (BinOp) {
default:
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i32;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i32;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i32;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i32;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i32;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i32;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i32;
}
}
if (XLen == 64) {
switch (BinOp) {
default:
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i64;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i64;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i64;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i64;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i64;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i64;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i64;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i64;
}
}
llvm_unreachable("Unexpected XLen\n");
}
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering =
Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
Type *Tys[] = {AlignedAddr->getType()};
Function *LrwOpScwLoop = Intrinsic::getDeclaration(
AI->getModule(),
getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);
if (XLen == 64) {
Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
}
Value *Result;
// Must pass the shift amount needed to sign extend the loaded value prior
// to performing a signed comparison for min/max. ShiftAmt is the number of
// bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which
// is the number of bits to left+right shift the value in order to
// sign-extend.
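// For example, an i8 atomicrmw min on RV64 with the byte at bit offset 0 of
// the aligned word gets SextShamt = 64 - 8 - 0 = 56, i.e. shift left then
// arithmetic-shift right by 56 to sign-extend the loaded byte.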
if (AI->getOperation() == AtomicRMWInst::Min ||
AI->getOperation() == AtomicRMWInst::Max) {
const DataLayout &DL = AI->getModule()->getDataLayout();
unsigned ValWidth =
DL.getTypeStoreSizeInBits(AI->getValOperand()->getType());
Value *SextShamt =
Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
Result = Builder.CreateCall(LrwOpScwLoop,
{AlignedAddr, Incr, Mask, SextShamt, Ordering});
} else {
Result =
Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
}
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *CI) const {
unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
}
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
if (XLen == 64) {
CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
}
Type *Tys[] = {AlignedAddr->getType()};
Function *MaskedCmpXchg =
Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
Value *Result = Builder.CreateCall(
MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
Register RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
}
Register RISCVTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return RISCV::X11;
}
bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
// Return false to suppress unnecessary extensions when a libcall argument or
// return value has f32 type under the LP64 ABI.
RISCVABI::ABI ABI = Subtarget.getTargetABI();
if (ABI == RISCVABI::ABI_LP64 && (Type == MVT::f32))
return false;
return true;
}
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
if (VT.isScalarInteger()) {
// Do not perform the transformation on riscv32 with the M extension.
if (!Subtarget.is64Bit() && Subtarget.hasStdExtM())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
if (ConstNode->getAPIntValue().getBitWidth() > 8 * sizeof(int64_t))
return false;
int64_t Imm = ConstNode->getSExtValue();
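// Decomposition is profitable when the immediate is 2^k +/- 1 or its
// negation, e.g. x*7 = (x<<3) - x and x*9 = (x<<3) + x.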
if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) ||
isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm))
return true;
}
}
return false;
}
#define GET_REGISTER_MATCHER
#include "RISCVGenAsmMatcher.inc"
Register
RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = MatchRegisterAltName(RegName);
if (Reg == RISCV::NoRegister)
Reg = MatchRegisterName(RegName);
if (Reg == RISCV::NoRegister)
report_fatal_error(
Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
BitVector ReservedRegs = Subtarget.getRegisterInfo()->getReservedRegs(MF);
if (!ReservedRegs.test(Reg) && !Subtarget.isRegisterReservedByUser(Reg))
report_fatal_error(Twine("Trying to obtain non-reserved register \"" +
StringRef(RegName) + "\"."));
return Reg;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index 34a463626e29..afac509f743d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -1,634 +1,1063 @@
//===-- RISCVInstrInfoB.td - RISC-V 'B' instructions -------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the RISC-V instructions from the standard 'B' Bitmanip
// extension, version 0.92.
// This version is still experimental as the 'B' extension hasn't been
// ratified yet.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Operand definitions.
//===----------------------------------------------------------------------===//
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
let Name = "UImmLog2XLenHalf";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidUImmLog2XLenHalf";
}
def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
if (Subtarget->is64Bit())
return isUInt<5>(Imm);
return isUInt<4>(Imm);
}]> {
let ParserMatchClass = UImmLog2XLenHalfAsmOperand;
let DecoderMethod = "decodeUImmOperand<5>";
let MCOperandPredicate = [{
int64_t Imm;
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
if (STI.getTargetTriple().isArch64Bit())
return isUInt<5>(Imm);
return isUInt<4>(Imm);
}];
}
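// The shuffle immediate therefore covers 0-15 on RV32 and 0-31 on RV64, i.e.
// 0 to XLen/2 - 1.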
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
// Some of these templates should be moved to RISCVInstrFormats.td once the B
// extension has been ratified.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBUnary<bits<7> funct7, bits<5> funct5, bits<3> funct3,
RISCVOpcode opcode, string opcodestr>
: RVInstR<funct7, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{24-20} = funct5;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBALUW_ri<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
(ins GPR:$rs1, simm12:$imm12), opcodestr, "$rd, $rs1, $imm12">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShift_ri<bits<5> funct5, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd),
(ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
"$rd, $rs1, $shamt"> {
bits<6> shamt;
let Inst{31-27} = funct5;
// NOTE: the bit op(26)=1 is used to select funnel shifts. All other
// shift operations and operations that live in the encoding space
// of the shifts (single-bit operations, grev, gorc) use op(26) = 0.
let Inst{26} = 0;
let Inst{25-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShiftW_ri<bits<7> funct7, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, uimm5:$shamt),
opcodestr, "$rd, $rs1, $shamt"> {
bits<5> shamt;
let Inst{31-25} = funct7;
let Inst{24-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShfl_ri<bits<6> funct6, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, shfl_uimm:$shamt),
opcodestr, "$rd, $rs1, $shamt"> {
bits<6> shamt;
let Inst{31-26} = funct6;
let Inst{25-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryR<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<funct2, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs2, GPR:$rs3), opcodestr, argstr> {
let Inst{14-12} = funct3_b;
}
// Currently used by FSRI only
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryImm6<bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<0b10, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
opcodestr, argstr> {
bits<6> shamt;
// NOTE: the first argument of RVInstR4 is hardcoded to 0b10, as for the other
// funnel shift instructions. The second bit of that argument is then
// overwritten by the shamt, as the encoding of this particular instruction
// requires. This yields op(26) = 1, as required by the funnel shift
// instructions, without needing a confusing extra argument in the
// instruction's definition.
let Inst{25-20} = shamt;
let Inst{14-12} = funct3_b;
}
// Currently used by FSRIW only
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryImm5<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<funct2, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs3, uimm5:$shamt), opcodestr, argstr> {
bits<5> shamt;
let Inst{24-20} = shamt;
let Inst{14-12} = funct3_b;
}
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZbbOrZbp] in {
def ANDN : ALU_rr<0b0100000, 0b111, "andn">, Sched<[]>;
def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[]>;
def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbb] in {
def SLO : ALU_rr<0b0010000, 0b001, "slo">, Sched<[]>;
def SRO : ALU_rr<0b0010000, 0b101, "sro">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbbOrZbp] in {
def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbs] in {
def SBCLR : ALU_rr<0b0100100, 0b001, "sbclr">, Sched<[]>;
def SBSET : ALU_rr<0b0010100, 0b001, "sbset">, Sched<[]>;
def SBINV : ALU_rr<0b0110100, 0b001, "sbinv">, Sched<[]>;
def SBEXT : ALU_rr<0b0100100, 0b101, "sbext">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbb] in {
def SLOI : RVBShift_ri<0b00100, 0b001, OPC_OP_IMM, "sloi">, Sched<[]>;
def SROI : RVBShift_ri<0b00100, 0b101, OPC_OP_IMM, "sroi">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbbOrZbp] in
def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
let Predicates = [HasStdExtZbs] in {
def SBCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "sbclri">, Sched<[]>;
def SBSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "sbseti">, Sched<[]>;
def SBINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "sbinvi">, Sched<[]>;
def SBEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "sbexti">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>;
def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbt] in {
def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">,
Sched<[]>;
def CMOV : RVBTernaryR<0b11, 0b101, OPC_OP, "cmov", "$rd, $rs2, $rs1, $rs3">,
Sched<[]>;
def FSL : RVBTernaryR<0b10, 0b001, OPC_OP, "fsl", "$rd, $rs1, $rs3, $rs2">,
Sched<[]>;
def FSR : RVBTernaryR<0b10, 0b101, OPC_OP, "fsr", "$rd, $rs1, $rs3, $rs2">,
Sched<[]>;
def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri",
"$rd, $rs1, $rs3, $shamt">, Sched<[]>;
} // Predicates = [HasStdExtZbt]
let Predicates = [HasStdExtZbb] in {
def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">,
Sched<[]>;
def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">,
Sched<[]>;
def PCNT : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "pcnt">,
Sched<[]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbm, IsRV64] in
def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, RISCVOpcode<0b0010011>,
"bmatflip">, Sched<[]>;
let Predicates = [HasStdExtZbb] in {
def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>,
"sext.b">, Sched<[]>;
def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>,
"sext.h">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbr] in {
def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, RISCVOpcode<0b0010011>,
"crc32.b">, Sched<[]>;
def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, RISCVOpcode<0b0010011>,
"crc32.h">, Sched<[]>;
def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, RISCVOpcode<0b0010011>,
"crc32.w">, Sched<[]>;
} // Predicates = [HasStdExtZbr]
let Predicates = [HasStdExtZbr, IsRV64] in
def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, RISCVOpcode<0b0010011>,
"crc32.d">, Sched<[]>;
let Predicates = [HasStdExtZbr] in {
def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, RISCVOpcode<0b0010011>,
"crc32c.b">, Sched<[]>;
def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, RISCVOpcode<0b0010011>,
"crc32c.h">, Sched<[]>;
def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, RISCVOpcode<0b0010011>,
"crc32c.w">, Sched<[]>;
} // Predicates = [HasStdExtZbr]
let Predicates = [HasStdExtZbr, IsRV64] in
def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, RISCVOpcode<0b0010011>,
"crc32c.d">, Sched<[]>;
let Predicates = [HasStdExtZbc] in {
def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>;
def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, Sched<[]>;
def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>;
} // Predicates = [HasStdExtZbc]
let Predicates = [HasStdExtZbb] in {
def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[]>;
def MAX : ALU_rr<0b0000101, 0b101, "max">, Sched<[]>;
def MINU : ALU_rr<0b0000101, 0b110, "minu">, Sched<[]>;
def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbp] in {
def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>;
def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbe] in {
def BDEP : ALU_rr<0b0100100, 0b110, "bdep">, Sched<[]>;
def BEXT : ALU_rr<0b0000100, 0b110, "bext">, Sched<[]>;
} // Predicates = [HasStdExtZbe]
let Predicates = [HasStdExtZbbOrZbp] in {
def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbm, IsRV64] in {
def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
} // Predicates = [HasStdExtZbm, IsRV64]
let Predicates = [HasStdExtZbbOrZbp] in
def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
let Predicates = [HasStdExtZbf] in
def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
let Predicates = [HasStdExtZbp] in {
def SHFLI : RVBShfl_ri<0b000010, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
def UNSHFLI : RVBShfl_ri<0b000010, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbb, IsRV64] in {
def ADDIWU : RVBALUW_ri<0b100, "addiwu">, Sched<[]>;
def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slliu.w">, Sched<[]>;
def ADDWU : ALUW_rr<0b0000101, 0b000, "addwu">, Sched<[]>;
def SUBWU : ALUW_rr<0b0100101, 0b000, "subwu">, Sched<[]>;
def ADDUW : ALUW_rr<0b0000100, 0b000, "addu.w">, Sched<[]>;
def SUBUW : ALUW_rr<0b0100100, 0b000, "subu.w">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
def SLOW : ALUW_rr<0b0010000, 0b001, "slow">, Sched<[]>;
def SROW : ALUW_rr<0b0010000, 0b101, "srow">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbs, IsRV64] in {
def SBCLRW : ALUW_rr<0b0100100, 0b001, "sbclrw">, Sched<[]>;
def SBSETW : ALUW_rr<0b0010100, 0b001, "sbsetw">, Sched<[]>;
def SBINVW : ALUW_rr<0b0110100, 0b001, "sbinvw">, Sched<[]>;
def SBEXTW : ALUW_rr<0b0100100, 0b101, "sbextw">, Sched<[]>;
} // Predicates = [HasStdExtZbs, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
def SLOIW : RVBShiftW_ri<0b0010000, 0b001, OPC_OP_IMM_32, "sloiw">, Sched<[]>;
def SROIW : RVBShiftW_ri<0b0010000, 0b101, OPC_OP_IMM_32, "sroiw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
let Predicates = [HasStdExtZbs, IsRV64] in {
def SBCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "sbclriw">,
Sched<[]>;
def SBSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "sbsetiw">,
Sched<[]>;
def SBINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "sbinviw">,
Sched<[]>;
} // Predicates = [HasStdExtZbs, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt, IsRV64] in {
def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32,
"fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw",
"$rd, $rs1, $rs3, $rs2">, Sched<[]>;
def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32,
"fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
} // Predicates = [HasStdExtZbt, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>,
"clzw">, Sched<[]>;
def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>,
"ctzw">, Sched<[]>;
def PCNTW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
"pcntw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbc, IsRV64] in {
def CLMULW : ALUW_rr<0b0000101, 0b001, "clmulw">, Sched<[]>;
def CLMULRW : ALUW_rr<0b0000101, 0b010, "clmulrw">, Sched<[]>;
def CLMULHW : ALUW_rr<0b0000101, 0b011, "clmulhw">, Sched<[]>;
} // Predicates = [HasStdExtZbc, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbe, IsRV64] in {
def BDEPW : ALUW_rr<0b0100100, 0b110, "bdepw">, Sched<[]>;
def BEXTW : ALUW_rr<0b0000100, 0b110, "bextw">, Sched<[]>;
} // Predicates = [HasStdExtZbe, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbf, IsRV64] in
def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>;
//===----------------------------------------------------------------------===//
// Future compressed instructions
//===----------------------------------------------------------------------===//
// The presence of these instructions in the B extension is purely experimental
// and they should be moved to the C extension as soon as they are ratified.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBInstC<bits<2> funct2, string opcodestr>
: RVInst16<(outs GPRC:$rs_wb), (ins GPRC:$rs), opcodestr, "$rs", [],
InstFormatCR> {
bits<3> rs;
let Constraints = "$rs = $rs_wb";
let Inst{15-12} = 0b0110;
let Inst{11-10} = funct2;
let Inst{9-7} = rs;
let Inst{6-0} = 0b0000001;
}
// The namespace RVBC exists to avoid encoding conflicts with the compressed
// instructions c.addi16sp and c.lui already implemented in the C extension.
let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
def C_NOT : RVBInstC<0b00, "c.not">, Sched<[]>;
def C_NEG : RVBInstC<0b01, "c.neg">, Sched<[]>;
} // DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC]
let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in
def C_ZEXTW : RVBInstC<0b10, "c.zext.w">, Sched<[]>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZbb, IsRV32] in {
def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
def : InstAlias<"zext.h $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
} // Predicates = [HasStdExtZbb, IsRV32]
let Predicates = [HasStdExtZbb, IsRV64] in {
def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
def : InstAlias<"zext.h $rd, $rs", (PACKW GPR:$rd, GPR:$rs, X0)>;
def : InstAlias<"zext.w $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp] in {
def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>,
Sched<[]>;
def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>,
Sched<[]>;
def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>,
Sched<[]>;
def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>,
Sched<[]>;
def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>,
Sched<[]>;
def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>,
Sched<[]>;
def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>,
Sched<[]>;
def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>,
Sched<[]>;
def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>,
Sched<[]>;
def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>,
Sched<[]>;
def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>,
Sched<[]>;
def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>,
Sched<[]>;
def : InstAlias<"zip2.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0010)>,
Sched<[]>;
def : InstAlias<"unzip2.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0010)>,
Sched<[]>;
def : InstAlias<"zip.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0011)>,
Sched<[]>;
def : InstAlias<"unzip.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0011)>,
Sched<[]>;
def : InstAlias<"zip4.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0100)>,
Sched<[]>;
def : InstAlias<"unzip4.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0100)>,
Sched<[]>;
def : InstAlias<"zip2.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0110)>,
Sched<[]>;
def : InstAlias<"unzip2.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0110)>,
Sched<[]>;
def : InstAlias<"zip.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0111)>,
Sched<[]>;
def : InstAlias<"unzip.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0111)>,
Sched<[]>;
def : InstAlias<"orc.p $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00001)>,
Sched<[]>;
def : InstAlias<"orc2.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00010)>,
Sched<[]>;
def : InstAlias<"orc.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00011)>,
Sched<[]>;
def : InstAlias<"orc4.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00100)>,
Sched<[]>;
def : InstAlias<"orc2.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00110)>,
Sched<[]>;
def : InstAlias<"orc.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00111)>,
Sched<[]>;
def : InstAlias<"orc8.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01000)>,
Sched<[]>;
def : InstAlias<"orc4.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01100)>,
Sched<[]>;
def : InstAlias<"orc2.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01110)>,
Sched<[]>;
def : InstAlias<"orc.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01111)>,
Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1000)>,
Sched<[]>;
def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1000)>,
Sched<[]>;
def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>,
Sched<[]>;
def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>,
Sched<[]>;
def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>,
Sched<[]>;
def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>,
Sched<[]>;
def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>,
Sched<[]>;
def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>,
Sched<[]>;
def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def : InstAlias<"rev16.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b010000)>,
Sched<[]>;
def : InstAlias<"rev8.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011000)>,
Sched<[]>;
def : InstAlias<"rev4.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011100)>,
Sched<[]>;
def : InstAlias<"rev2.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011110)>,
Sched<[]>;
def : InstAlias<"rev.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011111)>,
Sched<[]>;
def : InstAlias<"rev32 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b100000)>,
Sched<[]>;
def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b110000)>,
Sched<[]>;
def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111000)>,
Sched<[]>;
def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111100)>,
Sched<[]>;
def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111110)>,
Sched<[]>;
def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111111)>,
Sched<[]>;
def : InstAlias<"zip8.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01000)>,
Sched<[]>;
def : InstAlias<"unzip8.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01000)>,
Sched<[]>;
def : InstAlias<"zip4.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01100)>,
Sched<[]>;
def : InstAlias<"unzip4.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01100)>,
Sched<[]>;
def : InstAlias<"zip2.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01110)>,
Sched<[]>;
def : InstAlias<"unzip2.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01110)>,
Sched<[]>;
def : InstAlias<"zip.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01111)>,
Sched<[]>;
def : InstAlias<"unzip.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01111)>,
Sched<[]>;
def : InstAlias<"zip16 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b10000)>,
Sched<[]>;
def : InstAlias<"unzip16 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b10000)>,
Sched<[]>;
def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11000)>,
Sched<[]>;
def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11000)>,
Sched<[]>;
def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11100)>,
Sched<[]>;
def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11100)>,
Sched<[]>;
def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11110)>,
Sched<[]>;
def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11110)>,
Sched<[]>;
def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11111)>,
Sched<[]>;
def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11111)>,
Sched<[]>;
def : InstAlias<"orc16.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b010000)>,
Sched<[]>;
def : InstAlias<"orc8.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011000)>,
Sched<[]>;
def : InstAlias<"orc4.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011100)>,
Sched<[]>;
def : InstAlias<"orc2.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011110)>,
Sched<[]>;
def : InstAlias<"orc.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011111)>,
Sched<[]>;
def : InstAlias<"orc32 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b100000)>,
Sched<[]>;
def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b110000)>,
Sched<[]>;
def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111000)>,
Sched<[]>;
def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111100)>,
Sched<[]>;
def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111110)>,
Sched<[]>;
def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>,
Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
//===----------------------------------------------------------------------===//
// Compressed Instruction patterns
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
(C_NOT GPRC:$rs1)>;
def : CompressPat<(SUB GPRC:$rs1, X0, GPRC:$rs1),
(C_NEG GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtC]
let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in {
def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0),
(C_ZEXTW GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Codegen patterns
+//===----------------------------------------------------------------------===//
+def SLOIPat : ComplexPattern<XLenVT, 2, "SelectSLOI", [or]>;
+def SROIPat : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>;
+def RORIPat : ComplexPattern<XLenVT, 2, "SelectRORI", [rotl]>;
+def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>;
+def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>;
+def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>;
+def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>;
+def FSRIWPat : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>;
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp]
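+// For example, "andn rd, rs1, rs2" computes rs1 & ~rs2, so a mask held in rs2
+// can be cleared from rs1 in a single instruction without a separate "not".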
+
+let Predicates = [HasStdExtZbb] in {
+def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SLO GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SRO GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
+def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
+def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbs, IsRV32] in
+def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1),
+ (SBCLR GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in
+def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1),
+ (SBCLR GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbs] in
+def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>;
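+// Worked example for the pattern above: -2 is all-ones except bit 0, so
+// (rotl -2, n) moves that single zero bit to position n, i.e. it equals
+// ~(1 << n); and'ing that with rs1 clears bit n, which is exactly sbclr.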
+
+let Predicates = [HasStdExtZbs, IsRV32] in
+def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
+ (SBSET GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in
+def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
+ (SBSET GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbs, IsRV32] in
+def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
+ (SBINV GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in
+def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
+ (SBINV GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbs, IsRV32] in
+def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1),
+ (SBEXT GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbs, IsRV64] in
+def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1),
+ (SBEXT GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbb] in {
+def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SROI GPR:$rs1, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbb]
+
+// There's no encoding for roli in the current version of the 'B' extension
+// (v0.92) as it can be implemented with rori by negating the immediate.
+// For this reason we pattern-match only against rori[w].
+let Predicates = [HasStdExtZbbOrZbp] in
+def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORI GPR:$rs1, uimmlog2xlen:$shamt)>;
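+// Illustrative example: on RV32, rotl x, 13 equals rotr x, 19 (32 - 13), so it
+// can be selected as "rori x, 19"; SelectRORI is presumably where the
+// immediate gets folded this way.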
+
+// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are
+// pattern-matched by simple andi, ori, and xori.
+let Predicates = [HasStdExtZbs] in
+def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
+ (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
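+// For the low bit positions this works out to plain immediates, e.g. clearing
+// bit 3 is "andi rd, rs, -9" (~(1 << 3) sign-extends to -9), setting bit 5 is
+// "ori rd, rs, 32" and flipping bit 2 is "xori rd, rs, 4", all within simm12.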
+
+let Predicates = [HasStdExtZbp, IsRV32] in {
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))),
+ (GORCI GPR:$rs1, (i32 1))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))),
+ (GORCI GPR:$rs1, (i32 2))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))),
+ (GORCI GPR:$rs1, (i32 4))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))),
+ (GORCI GPR:$rs1, (i32 8))>;
+def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1),
+ (shl GPR:$rs1, (i32 16))),
+ (GORCI GPR:$rs1, (i32 16))>;
+} // Predicates = [HasStdExtZbp, IsRV32]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))),
+ (GORCI GPR:$rs1, (i64 1))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))),
+ (GORCI GPR:$rs1, (i64 2))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))),
+ (GORCI GPR:$rs1, (i64 4))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))),
+ (GORCI GPR:$rs1, (i64 8))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))),
+ (GORCI GPR:$rs1, (i64 16))>;
+def : Pat<(or (or (srl GPR:$rs1, (i64 32)), GPR:$rs1),
+ (shl GPR:$rs1, (i64 32))),
+ (GORCI GPR:$rs1, (i64 32))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV32] in {
+def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)),
+ (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))),
+ (GREVI GPR:$rs1, (i32 1))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)),
+ (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))),
+ (GREVI GPR:$rs1, (i32 2))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)),
+ (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))),
+ (GREVI GPR:$rs1, (i32 4))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)),
+ (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))),
+ (GREVI GPR:$rs1, (i32 8))>;
+def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>;
+def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))),
+ (GREVI GPR:$rs1, (i32 16))>;
+def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>;
+def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>;
+def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>;
+} // Predicates = [HasStdExtZbp, IsRV32]
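+// Worked example for the shamt 24 and 31 cases above: grevi with shamt 24
+// reverses the byte order, so 0x12345678 becomes 0x78563412 (bswap), while
+// shamt 31 reverses all 32 bits (bitreverse).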
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))),
+ (GREVI GPR:$rs1, (i64 1))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))),
+ (GREVI GPR:$rs1, (i64 2))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0)),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))),
+ (GREVI GPR:$rs1, (i64 4))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))),
+ (GREVI GPR:$rs1, (i64 8))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)),
+ (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))),
+ (GREVI GPR:$rs1, (i64 16))>;
+def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))),
+ (GREVI GPR:$rs1, (i64 32))>;
+def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>;
+def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>;
+def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbt] in {
+def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)),
+ (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3),
+ (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3),
+ (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt),
+ (FSRI GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbt]
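+// Note on the bare (XLenVT 17) above: it appears to be the integer value of
+// ISD::SETEQ, so the cmov pattern reads "if rs2 == 0 pick rs3, else pick rs1",
+// matching cmov's select-on-nonzero semantics.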
+
+let Predicates = [HasStdExtZbb] in {
+def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>;
+def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>;
+def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbb, IsRV32] in
+def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>;
+let Predicates = [HasStdExtZbb, IsRV64] in
+def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>;
+
+let Predicates = [HasStdExtZbb, IsRV32] in
+def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>;
+let Predicates = [HasStdExtZbb, IsRV64] in
+def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>;
+
+let Predicates = [HasStdExtZbb] in {
+def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2),
+ (MIN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2),
+ (MAX GPR:$rs1, GPR:$rs2)>;
+def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2),
+ (MINU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2),
+ (MAXU GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb]
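+// The hard-coded condition codes above appear to be ISD enum values: 20 is
+// ISD::SETLT and 12 is ISD::SETULT, so e.g. selectcc(rs1, rs2, SETLT, rs1,
+// rs2) is just the signed minimum and maps onto MIN.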
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
+def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))),
+ (PACK GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))),
+ (PACK GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
+def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))),
+ (PACKU GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))),
+ (PACKU GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp] in
+def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00),
+ (and GPR:$rs1, 0x00FF)),
+ (PACKH GPR:$rs1, GPR:$rs2)>;
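+// In other words, packh concatenates the low bytes of its operands into the
+// low halfword and zero-extends: rd = (rs2[7:0] << 8) | rs1[7:0], which is
+// what the shl/0xFF00 and 0x00FF masks above spell out.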
+
+let Predicates = [HasStdExtZbp, IsRV32] in {
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
+ (and GPR:$rs1, (i32 0xFF0000FF))),
+ (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
+ (SHFLI GPR:$rs1, (i32 8))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
+ (and GPR:$rs1, (i32 0xF00FF00F))),
+ (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
+ (SHFLI GPR:$rs1, (i32 4))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
+ (and GPR:$rs1, (i32 0xC3C3C3C3))),
+ (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
+ (SHFLI GPR:$rs1, (i32 2))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
+ (and GPR:$rs1, (i32 0x99999999))),
+ (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
+ (SHFLI GPR:$rs1, (i32 1))>;
+} // Predicates = [HasStdExtZbp, IsRV32]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
+ (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
+ (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
+ (SHFLI GPR:$rs1, (i64 16))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
+ (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
+ (SHFLI GPR:$rs1, (i64 8))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
+ (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
+ (SHFLI GPR:$rs1, (i64 4))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
+ (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
+ (SHFLI GPR:$rs1, (i64 2))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
+ (and GPR:$rs1, (i64 0x9999999999999999))),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
+ (SHFLI GPR:$rs1, (i64 1))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)),
+ (ADDIWU GPR:$rs, simm12:$simm12)>;
+def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)),
+ (ADDWU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)),
+ (SUBWU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))),
+ (ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))),
+ (SUBUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SLOW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SROW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
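+// Reading the patterns above: the *WU forms zero-extend the 32-bit result,
+// e.g. addwu rd, rs1, rs2 is (rs1 + rs2) & 0xffffffff, while the *U.W forms
+// zero-extend only the second source, e.g. addu.w is rs1 + (rs2 & 0xffffffff).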
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)),
+ (riscv_srlw (assertsexti32 GPR:$rs1),
+ (sub (i64 0), (assertsexti32 GPR:$rs2)))),
+ (ROLW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (sub (i64 0), (assertsexti32 GPR:$rs2))),
+ (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))),
+ (RORW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+let Predicates = [HasStdExtZbs, IsRV64] in {
+def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1),
+ (assertsexti32 GPR:$rs1)),
+ (SBCLRW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)),
+ (assertsexti32 GPR:$rs1)),
+ (SBSETW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)),
+ (assertsexti32 GPR:$rs1)),
+ (SBINVW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)),
+ 1),
+ (SBEXTW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbs, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 1))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 2))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 4))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 8))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 16))>;
+def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)),
+ GPR:$rs1),
+ (shl GPR:$rs1, (i64 16))), i32),
+ (GORCIW GPR:$rs1, (i64 16))>;
+
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 1))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 2))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 4))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 8))>;
+def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)),
+ (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32),
+ (GREVIW GPR:$rs1, (i64 16))>;
+def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>;
+def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbt, IsRV64] in {
+def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
+ (i64 0),
+ (i64 17),
+ (assertsexti32 GPR:$rs1),
+ (or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (and (assertsexti32 GPR:$rs3), 31)),
+ (riscv_srlw (assertsexti32 GPR:$rs2),
+ (sub (i64 32),
+ (assertsexti32 GPR:$rs3))))),
+ (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
+ (i64 0),
+ (i64 17),
+ (assertsexti32 GPR:$rs2),
+ (or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (sub (i64 32),
+ (assertsexti32 GPR:$rs3))),
+ (riscv_srlw (assertsexti32 GPR:$rs2),
+ (and (assertsexti32 GPR:$rs3), 31)))),
+ (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt),
+ (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbt, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)),
+ (CLZW GPR:$rs1)>;
+// We don't pattern-match CTZW here as it has the same pattern and result as
+// RV64 CTZ.
+def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)),
+ (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)),
+ i32),
+ (PACKW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
+ (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000),
+ (i64 16))),
+ (PACKUW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8b6b7eb3aff..86aa85e965f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1,50203 +1,50254 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc(
"Sets the preferable loop alignment for experiments (as log2 bytes)"
"(the last x86-experimental-pref-loop-alignment bits"
" of the loop header PC will be 0)."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
"SHIFT, LEA, etc."),
cl::Hidden);
static cl::opt<bool> ExperimentalUnorderedISEL(
"x86-experimental-unordered-atomic-isel", cl::init(false),
cl::desc("Use LoadSDNode and StoreSDNode instead of "
"AtomicSDNode for unordered atomic loads and "
"stores respectively."),
cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
// 32 bits so the AtomicExpandPass will expand it and we don't need cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
setCondCodeAction(ISD::SETOEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);
}
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
}
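// Rough illustration: fshl(a, b, 5) on i32 is (a << 5) | (b >> 27), which the
// SHLD instruction computes directly ("shld eax, ebx, 5"); on slow-SHLD
// targets that form is presumably only worth emitting when optimizing for size.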
if (!Subtarget.useSoftFloat()) {
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
// SSE has no i16 to fp conversion, only i32. We promote in the handler
// to allow f80 to use i16 and f64 to use i16 with SSE1 only.
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
// Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::LRINT, MVT::f32, Custom);
setOperationAction(ISD::LRINT, MVT::f64, Custom);
setOperationAction(ISD::LLRINT, MVT::f32, Custom);
setOperationAction(ISD::LLRINT, MVT::f64, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::LRINT, MVT::i64, Custom);
setOperationAction(ISD::LLRINT, MVT::i64, Custom);
}
}
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
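// Concretely, a single 32-bit "idiv" leaves the quotient in EAX and the
// remainder in EDX, so expanding SDIV/SREM to the two-result SDIVREM node lets
// CSE reuse one division for both x/y and x%y.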
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::CTLZ , VT, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
}
}
for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
ISD::STRICT_FP_TO_FP16}) {
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
setOperationAction(
Op, MVT::f32,
(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
// There's never any support for operations beyond MVT::f32.
setOperationAction(Op, MVT::f64, Expand);
setOperationAction(Op, MVT::f80, Expand);
setOperationAction(Op, MVT::f128, Expand);
}
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
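// The FABS/FNEG/FCOPYSIGN entries above rely on the usual IEEE sign-bit mask
// trick (illustrative, not the exact constants used by the lowering code):
//   fabs(x): x & 0x7fff...  (ANDPD with a mask that clears the sign bit)
//   fneg(x): x ^ 0x8000...  (XORPD with a mask containing only the sign bit)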
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
(UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
// Expand FP32 immediates into loads from the stack, except for special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, except for special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
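// addLegalFPImmediate records constants that can be materialized directly
// (x87 FLD0/FLD1, or an SSE xorps/xorpd zero idiom) so the selector can keep
// them as immediates instead of constant-pool loads; a descriptive note on
// the intent of the annotations above.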
// Handle constrained floating-point operations on scalars.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// f80 always uses X87.
if (UseX87) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Custom);
setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations on scalars.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
setOperationAction(ISD::FMA, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
setOperationAction(ISD::FSIN, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
setOperationAction(ISD::FCOS, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
// No STRICT_FSINCOS
setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
if (isTypeLegal(MVT::f32)) {
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
}
if (isTypeLegal(MVT::f64)) {
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
}
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
}
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
}
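// For the f128 entries above, LibCall means the operation is lowered to a
// call into the soft-float runtime rather than expanded inline; for example,
// an f128 add typically becomes a call to a routine such as __addtf3 (the
// exact symbol depends on the runtime in use).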
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types; we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
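// The loop above implements the usual "default everything to Expand, then
// selectively re-enable" pattern: the feature-gated blocks below overwrite
// these entries with Legal or Custom for the vector types the subtarget
// actually supports, and anything left untouched keeps the scalarizing or
// widening fallback. Roughly, a later query such as
//   getOperationAction(ISD::SDIV, MVT::v4i32)   // illustrative
// returns Expand unless one of the blocks below has overridden it.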
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx are supported; everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With 512-bit registers or AVX512VL+BW, expanding (and promoting the
// shifts) is better.
if (!Subtarget.useAVX512Regs() &&
!(Subtarget.hasBWI() && Subtarget.hasVLX()))
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i16, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
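// Marking these extending loads Legal lets a sign- or zero-extending load be
// selected as a single PMOVSX/PMOVZX reading straight from memory instead of
// a plain load followed by a separate extend (a description of the intent;
// the matching itself lives in the isel patterns).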
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
// We need to mark SINT_TO_FP as Custom even though we want to expand it
// so that DAG combine doesn't try to turn it into uint_to_fp.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.useBWIRegs())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
}
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256-bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
setOperationAction(ISD::STORE, MVT::v1i1, Custom);
setOperationAction(ISD::STORE, MVT::v2i1, Custom);
setOperationAction(ISD::STORE, MVT::v4i1, Custom);
setOperationAction(ISD::STORE, MVT::v8i1, Custom);
}
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
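// With AVX512, these narrow i1 vectors live in mask (k) registers via the
// VK* register classes added above. Most operations on them are Custom so the
// target can map them onto mask instructions; for i1 elements, add/sub behave
// like xor and mul like and (mod-2 arithmetic), which the custom lowering can
// exploit. A descriptive note, not an exhaustive account of the lowering.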
// This block controls legalization for 512-bit operations with 32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
bool HasBWI = Subtarget.hasBWI();
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
if (HasBWI)
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
if (HasBWI)
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
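// Widening a 128/256-bit masked load or store to 512 bits keeps the extra
// mask lanes false, so the operation still touches only the originally
// requested elements while using the EVEX k-mask encodings (a sketch of the
// intent; the widening itself happens in the custom lowering).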
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
if (HasBWI) {
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
}
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
}
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
}
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
if (HasBWI) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
}
} else {
setOperationAction(ISD::STORE, MVT::v32i16, Custom);
setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}// useAVX512Regs
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MUL, VT, Legal);
}
}
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
// This block controls legalization of v32i1/v64i1, which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
ISD::FEXP, ISD::STRICT_FEXP,
ISD::FFLOOR, ISD::STRICT_FFLOOR,
ISD::FREM, ISD::STRICT_FREM,
ISD::FLOG, ISD::STRICT_FLOG,
ISD::FLOG10, ISD::STRICT_FLOG10,
ISD::FPOW, ISD::STRICT_FPOW,
ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::STRICT_FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
setTargetDAGCombine(ISD::FP16_TO_FP);
setTargetDAGCombine(ISD::FP_EXTEND);
setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
setTargetDAGCombine(ISD::FP_ROUND);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
}
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const {
EVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
return SDValue(Node, 0);
}
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
!Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
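// Note: the helper below maps a vXi1 mask argument to the register type used
// to pass it under a given calling convention. The returned pair is
// {register type, number of registers}; {INVALID_SIMPLE_VALUE_TYPE, 0} means
// the callers should fall back to the default TargetLowering behavior.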
static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
const X86Subtarget &Subtarget) {
// v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
// convention is one that uses k registers.
if (NumElts == 2)
return {MVT::v2i64, 1};
if (NumElts == 4)
return {MVT::v4i32, 1};
if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
CC != CallingConv::Intel_OCL_BI)
return {MVT::v8i16, 1};
if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
CC != CallingConv::Intel_OCL_BI)
return {MVT::v16i8, 1};
// v32i1 passes in ymm unless we have BWI and the calling convention is
// regcall.
if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
return {MVT::v32i8, 1};
// Split v64i1 vectors if we don't have v64i8 available.
if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
if (Subtarget.useAVX512Regs())
return {MVT::v64i8, 1};
return {MVT::v32i8, 2};
}
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
NumElts > 64)
return {MVT::i8, NumElts};
return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512()) {
unsigned NumElts = VT.getVectorNumElements();
MVT RegisterVT;
unsigned NumRegisters;
std::tie(RegisterVT, NumRegisters) =
handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
return RegisterVT;
}
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512()) {
unsigned NumElts = VT.getVectorNumElements();
MVT RegisterVT;
unsigned NumRegisters;
std::tie(RegisterVT, NumRegisters) =
handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
return NumRegisters;
}
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
VT.getVectorNumElements() > 64)) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
return NumIntermediates;
}
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
}
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
return MVT::i8;
if (Subtarget.hasAVX512()) {
const unsigned NumElts = VT.getVectorNumElements();
// Figure out what this type will be legalized to.
EVT LegalVT = VT;
while (getTypeAction(Context, LegalVT) != TypeLegal)
LegalVT = getTypeToTransformTo(Context, LegalVT);
// If we got a 512-bit vector then we'll definitely have a vXi1 compare.
if (LegalVT.getSimpleVT().is512BitVector())
return EVT::getVectorVT(Context, MVT::i1, NumElts);
if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
// If we legalized to less than a 512-bit vector, then we will use a vXi1
// compare for vXi32/vXi64 for sure. If we have BWI we will also support
// vXi16/vXi8.
MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
return EVT::getVectorVT(Context, MVT::i1, NumElts);
}
}
return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
Align EltAlign;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == 16)
break;
}
}
}
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
Align TyAlign = DL.getABITypeAlign(Ty);
if (TyAlign > 8)
return TyAlign.value();
return 8;
}
Align Alignment(4);
if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Alignment);
return Alignment.value();
}
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Op.size() >= 16 &&
(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Op.size() >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
// getMemsetStores() may create an intermediate splat (using an integer
// multiply) before we splat as a vector.
return MVT::v32i8;
}
if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
// The gymnastics of splatting a byte value into an XMM register and then
// only using 8-byte stores (because this is a CPU with slow unaligned
// 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
if (Subtarget.is64Bit() && Op.size() >= 8)
return MVT::i64;
return MVT::i32;
}
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
else if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
}
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
// 8-byte and under are always assumed to be fast.
*Fast = true;
break;
case 128:
*Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
*Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
}
// NonTemporal vector memory ops must be aligned.
if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
// NT loads can only be vector aligned, so if it's less aligned than the
// minimum vector size (which we can split the vector down to), we might as
// well use a regular unaligned vector load.
// We don't have any NT loads pre-SSE41.
if (!!(Flags & MachineMemOperand::MOLoad))
return (Align < 16 || !Subtarget.hasSSE41());
return false;
}
// Misaligned accesses of any size are always allowed.
return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
}
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const {
// Only relabel X86-32 for C / Stdcall CCs.
if (Subtarget.is64Bit())
return;
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
return;
unsigned ParamRegs = 0;
if (auto *M = MF->getFunction().getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N integer arguments as being passed in registers.
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
numRegs = 2;
if (ParamRegs < numRegs)
return;
ParamRegs -= numRegs;
Args[Idx].IsInReg = true;
}
}
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid, MCContext &Ctx) const {
assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
return Table;
}
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
}
unsigned X86TargetLowering::getAddressSpace() const {
if (Subtarget.is64Bit())
return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
return 256;
}
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
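// Builds an inttoptr constant for the given segment-relative offset. On x86,
// address space 256 corresponds to %gs and 257 to %fs, which is how the
// stack-guard and SafeStack TLS slots below are addressed.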
static Constant* SegmentOffset(IRBuilder<> &IRB,
unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. %gs:0x14 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
}
return TargetLowering::getIRStackGuard(IRB);
}
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addAttribute(1, Attribute::AttrKind::InReg);
}
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
}
return TargetLowering::getSDagStackGuard(M);
}
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
}
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case
// it's %gs:0x48. %gs:0x24 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
}
return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
const TargetMachine &TM = getTargetMachine();
if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
return false;
return SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
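/// For example, a v16i1 value destined for an i32 location is first bitcast
/// to i16 and then any-extended to i32.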
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
DAG.getIntPtrConstant(0, Dl));
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two stage lowering might be required
// bitcast: v8i1 -> i8 / v16i1 -> i16
// anyextend: i8 -> i32 / i16 -> i32
EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
}
if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
}
return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
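/// On 32-bit targets the value is bitcast to i64 and split into two i32
/// halves (Lo = bits 0-31, Hi = bits 32-63), which are placed in the two
/// consecutive registers described by VA and NextVA.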
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 halves to their corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction().hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SmallVector<std::pair<Register, SDValue>, 4> RetVals;
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// Report an error if we have attempted to return a value via an XMM
// register and SSE was disabled.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
ValVT == MVT::f64) {
// When returning a double via an XMM register, report an error if SSE2 is
// not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
Subtarget);
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
}
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
for (auto &RetVal : RetVals) {
if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
RetOps.push_back(RetVal.second);
continue; // Don't emit a copytoreg.
}
Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(
DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
Register RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call see PR19530
if (UI->getNumOperands() > 4)
return false;
if (UI->getNumOperands() == 4 &&
UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT = MVT::i32;
bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
// The ABI does not require i1, i8 or i16 to be extended.
//
// On Darwin, there is code in the wild relying on Clang's old behaviour of
// always extending i8/i16 return values, so keep doing that for now.
// (PR26665).
ReturnMVT = MVT::i8;
}
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32-bit registers and creates a 64-bit mask value.
/// \param VA The current 32-bit value that needs to be assigned.
/// \param NextVA The next 32-bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
/// glue purposes. In case the DAG is already using a
/// physical register instead of a virtual one, we should glue
/// our new SDValue to the InFlag SDValue.
/// \return a new 64-bit SDValue.
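/// This is the inverse of Passv64i1ArgInRegs: the two i32 halves read from
/// VA and NextVA are bitcast to v32i1 and concatenated back into a single
/// v64i1 value.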
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &Dl, const X86Subtarget &Subtarget,
SDValue *InFlag = nullptr) {
assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type");
assert(NextVA.getValVT() == VA.getValVT() &&
"The locations should have the same type");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The values should reside in two registers");
SDValue Lo, Hi;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
// Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
} else {
// When a physical register is available read the value from it and glue
// the reads together.
ArgValueLo =
DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueLo.getValue(2);
ArgValueHi =
DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueHi.getValue(2);
}
// Convert each i32 half into a v32i1 value.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
// Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// Lowers a register of various sizes (8/16/32/64 bits)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
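/// For example, a v32i1 value whose location type is i64 or i32 is truncated
/// (if needed) to i32 and then bitcast back to v32i1.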
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
const EVT &ValLoc, const SDLoc &Dl,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
if (ValVT == MVT::v64i1) {
// On a 32-bit machine this case is handled by getv64i1Argument.
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
// On a 64-bit machine there is no need to truncate the value, only bitcast it.
} else {
MVT maskLen;
switch (ValVT.getSimpleVT().SimpleTy) {
case MVT::v8i1:
maskLen = MVT::i8;
break;
case MVT::v16i1:
maskLen = MVT::i16;
break;
case MVT::v32i1:
maskLen = MVT::i32;
break;
default:
llvm_unreachable("Expecting a vector of i1 types");
}
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
return DAG.getBitcast(ValVT, ValReturned);
}
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// Report an error if there was an attempt to return FP values via XMM
// registers.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
if (VA.getLocInfo() == CCValAssign::BCvt)
Val = DAG.getBitcast(VA.getValVT(), Val);
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is the standard for many Windows API
// routines. It differs from the C calling convention only slightly: the
// callee cleans up the stack rather than the caller, and symbols are
// decorated (name-mangled). It doesn't support any vector arguments.
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
NotStructReturn,
RegStructReturn,
StackStructReturn
};
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
/*isVolatile*/ false, /*AlwaysInline=*/true,
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
// Swift:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
CallingConv::ID CalleeCC = CI->getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If the value is passed by pointer, we have the address passed instead of
// the value itself. No need to extend if the mask value and location share
// the same absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization, mark all arguments mutable, since they
// could be overwritten by the lowering of arguments in case of a tail call.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
// FIXME: For now, all byval parameter objects are marked as aliasing. This
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
// If the argument is passed directly in memory without any extension, then we
// can perform copy elision. Large vector types, for example, may be passed
// indirectly by pointer.
if (Flags.isCopyElisionCandidate() &&
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
/*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
int FI = MFI.getObjectIndexBegin();
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
break;
}
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
Ins[i].PartOffset));
}
}
}
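// Copy elision did not apply; create a fixed stack object for the argument's
// location and load the value from it below.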
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
}
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
}
static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain vararg parameters are shadowed
// by their paired GPRs, so we only need to save the GPRs to their home
// slots.
// TODO: __vectorcall will change this.
return None;
}
const Function &F = MF.getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return llvm::is_sorted(
ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
#endif
namespace {
/// This is a helper class for lowering variable-argument (vararg) parameters.
class VarArgsLoweringHelper {
public:
VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
CallingConv::ID CallConv, CCState &CCInfo)
: FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
TheMachineFunction(DAG.getMachineFunction()),
TheFunction(TheMachineFunction.getFunction()),
FrameInfo(TheMachineFunction.getFrameInfo()),
FrameLowering(*Subtarget.getFrameLowering()),
TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
CCInfo(CCInfo) {}
// Lower vararg parameters.
void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
private:
void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
void forwardMustTailParameters(SDValue &Chain);
bool is64Bit() { return Subtarget.is64Bit(); }
bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
X86MachineFunctionInfo *FuncInfo;
const SDLoc &DL;
SelectionDAG &DAG;
const X86Subtarget &Subtarget;
MachineFunction &TheMachineFunction;
const Function &TheFunction;
MachineFrameInfo &FrameInfo;
const TargetFrameLowering &FrameLowering;
const TargetLowering &TargLowering;
CallingConv::ID CallConv;
CCState &CCInfo;
};
} // namespace
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
SDValue &Chain, unsigned StackSize) {
// If the function takes a variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start. We
// can skip this if there are no va_start calls.
if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall)) {
FuncInfo->setVarArgsFrameIndex(
FrameInfo.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (is64Bit()) {
// Find the first unallocated argument register of each class (GPR and XMM).
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs =
get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
if (isWin64()) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_arg.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
}
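// In the SysV x86-64 case the register save area created above holds the GPR
// slots (ArgGPRs.size() * 8 bytes) followed by the XMM slots
// (ArgXMMs.size() * 16 bytes); VarArgsGPOffset/VarArgsFPOffset record where
// the first still-unallocated register of each class will be spilled.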
SmallVector<SDValue, 6>
LiveGPRs; // list of SDValues for GPR registers holding live input values
SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValues for XMM registers
// holding live input values
SDValue ALVal; // if applicable, holds the SDValue for the %al register
// Gather all the live in physical registers.
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
}
const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
if (!AvailableXmms.empty()) {
Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
for (MCPhysReg Reg : AvailableXmms) {
Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
LiveXMMRegs.push_back(
DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
}
}
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN =
DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
TargLowering.getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, DL,
TargLowering.getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, DL));
SDValue Store =
DAG.getStore(Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
// Now store the XMM (fp + vector) parameter registers.
if (!LiveXMMRegs.empty()) {
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(
DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
SaveXMMOps.push_back(
DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.useAVX512Regs() &&
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
RegParmTypes.push_back(IntVT);
if (VecVT != MVT::Other)
RegParmTypes.push_back(VecVT);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Forward AL for SysV x86_64 targets, since it is used for varargs.
if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
TargLowering.getRegClassFor(FR.VT));
Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
}
}
void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
unsigned StackSize) {
// Set FrameIndex to the 0xAAAAAAA value to mark unset state.
// If necessary, it will be set to the correct value later.
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
if (FrameInfo.hasVAStart())
createVarArgAreaAndStoreRegisters(Chain, StackSize);
if (FrameInfo.hasMustTailInVarArgFunc())
forwardMustTailParameters(Chain);
}
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
F.getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
!(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
}
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
assert(
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
// compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i8)
RC = &X86::GR8RegClass;
else if (RegVT == MVT::i16)
RC = &X86::GR16RegClass;
else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
else
llvm_unreachable("Unknown argument type!");
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
continue;
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
if (IsVarArg)
VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
.lowerVarArgsParameters(Chain, StackSize);
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
for (std::pair<Register, Register> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags,
bool isByVal) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
if (isByVal)
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle node for a movs{s|d} or movd
/// operation of the specified width.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
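// Illustrative example: for VT = v4f32 the mask built above is <4, 1, 2, 3>;
// index 4 selects element 0 of V2 and indices 1-3 select elements 1-3 of V1,
// which is exactly the movss-style merge.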
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in the
// caller's own caller's stack.
NumBytes = 0;
else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// Only update it if the new delta is more negative than the previous one.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
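// Illustrative example: if this function's own incoming stack arguments take
// 16 bytes but the callee needs 32, FPDiff = 16 - 32 = -16, i.e. the callee's
// argument area is 16 bytes larger and the return address slot has to be
// moved; only the most negative delta seen so far is recorded above.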
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and is right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
} else if (CLI.IsPreallocated) {
assert(ArgLocs.back().isMemLoc() &&
"cannot use preallocated attribute on a register "
"parameter");
SmallVector<size_t, 4> PreallocatedOffsets;
for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
}
}
auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
NumBytesToPush = 0;
}
if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca/preallocated arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
Flags.getByValSize(),
std::max(Align(16), Flags.getNonZeroByValAlign()), false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
}
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
Register ShadowReg;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags, isByVal));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT pointer to be in the EBX register before
// function calls via the PLT.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used, in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(Register(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
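// Illustrative example: a call such as printf("%f\n", x) that passes one
// double in XMM0 sets %al to 1 here; any upper bound on the XMM registers
// actually used (up to 8) would also satisfy the ABI.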
}
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(F.PReg, Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in a special case where one argument was
// passed through two register locations - skip the next location
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca/preallocated arguments. They don't require any work.
if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into 64 bits according to the x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (the NoCallerSavedRegisters attribute exists), we
// use the X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
if (CLI.CB)
if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is performed
// provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On the X86_64 architecture, with GOT-style position-independent code, only
// local (within-module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
// for example.)
// If a tail-called callee has more arguments than the caller, the caller needs
// to make sure that there is room to move the RETADDR to. This is achieved by
// reserving an area the size of the argument delta right after the original
// RETADDR, but before the saved frame pointer or the spilled registers,
// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Make the stack size aligned, e.g. to 16n + 12, so that a 16-byte alignment
/// requirement is satisfied once the return-address slot is included.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
unsigned Op = Arg.getOpcode();
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
Arg = Arg.getOperand(0);
continue;
}
if (Op == ISD::TRUNCATE) {
const SDValue &TruncInput = Arg.getOperand(0);
if (TruncInput.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
Arg.getValueType()) {
Arg = TruncInput.getOperand(0);
continue;
}
}
break;
}
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
return false;
if (!Flags.isByVal()) {
if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
} else
return false;
}
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
if (Flags.isByVal())
// ByVal argument is passed in as a pointer but it's now being
// dereferenced. e.g.
// define @foo(%struct.X* %A) {
// tail call @bar(%struct.X* byval %A)
// }
return false;
SDValue Ptr = Ld->getBasePtr();
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
if (!FINode)
return false;
FI = FINode->getIndex();
} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
FI = FINode->getIndex();
Bytes = Flags.getByValSize();
} else
return false;
assert(FI != INT_MAX);
if (!MFI.isFixedObjectIndex(FI))
return false;
if (Offset != MFI.getObjectOffset(FI))
return false;
// If this is not byval, check that the argument stack object is immutable.
// inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return X86::createFastISel(funcInfo, libInfo);
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
static bool MayFoldLoad(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
static bool MayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
}
return false;
}
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
return true;
}
}
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
return true;
// 'Faux' Target Shuffles.
case ISD::OR:
case ISD::AND:
case X86ISD::ANDNP:
return true;
}
}
static bool isTargetShuffleSplat(SDValue Op) {
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::EXTRACT_SUBVECTOR)
return isTargetShuffleSplat(Op.getOperand(0));
return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
if (!hasSymbolicDisplacement)
return true;
// FIXME: Some tweaks might be needed for medium code model.
if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
// For the small code model we assume that the last object is at most 16MB
// before the end of the 31-bit boundary. We may also accept pretty large
// negative constants, knowing that all objects are in the positive half of the
// address space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space. We do not accept negative offsets, since
// they may be just off, but we may accept pretty large positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
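// Illustrative examples of the checks above: with a symbolic displacement
// under the small code model, Offset = 8*1024*1024 is accepted (< 16MB) while
// Offset = 32*1024*1024 is rejected; under the kernel code model any
// non-negative offset that fits in 32 bits is accepted.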
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
// If GuaranteeTCO is true, we force some calls to be callee pop so that we
// can guarantee TCO.
if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
return true;
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
return !is64Bit;
}
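// Illustrative examples: stdcall and fastcall return true on 32-bit targets
// (the callee pops its arguments) and false on 64-bit targets; with
// GuaranteeTCO, a non-vararg fastcc call is forced to be callee-pop above so
// that guaranteed tail calls can work.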
}
/// Return true if the condition is a signed comparison operation.
static bool isX86CCSigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
case X86::COND_E:
case X86::COND_NE:
case X86::COND_B:
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
return false;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
return true;
}
}
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
case ISD::SETLT: return X86::COND_L;
case ISD::SETLE: return X86::COND_LE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETULT: return X86::COND_B;
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
}
}
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
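// Illustrative example: SETOLT is handled by swapping LHS/RHS above and then
// mapping to COND_A, so "X < Y (ordered)" becomes "Y > X", which tests
// CF == 0 and ZF == 0 after the flag-setting compare.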
}
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
default:
return false;
case X86::COND_B:
case X86::COND_BE:
case X86::COND_E:
case X86::COND_P:
case X86::COND_A:
case X86::COND_AE:
case X86::COND_NE:
case X86::COND_NP:
return true;
}
}
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData)
return false;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
switch (IntrData->Type) {
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
ScalarVT = MVT::i16;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
case GATHER:
case GATHER_AVX2: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
break;
}
case SCATTER: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
default:
return false;
}
return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
// If this is (1) an AVX vector load with (2) multiple uses and (3) all of
// those uses are extracted directly into a store, then the extract + store
// can be store-folded. Therefore, it's probably not worth splitting the load.
EVT VT = Load->getValueType(0);
if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
// Skip uses of the chain value. Result 0 of the node is the load value.
if (UI.getUse().getResNo() != 0)
continue;
// If this use is not an extract + store, it's probably worth splitting.
if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
UI->use_begin()->getOpcode() != ISD::STORE)
return true;
}
// All non-chain uses are extract + store.
return false;
}
return true;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return false;
return true;
}
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
// TODO: It might be a win to ease or lift this restriction, but the generic
// folds in DAGCombiner conflict with vector folds for an AVX512 target.
if (VT.isVector() && Subtarget.hasAVX512())
return false;
return true;
}
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
// Find the type this will be legalized to. Otherwise we might prematurely
// convert this to shl+add/sub and then still have to type legalize those ops.
// Another choice would be to defer the decision for illegal types until
// after type legalization. But constant splat vectors of i64 can't make it
// through type legalization on 32-bit targets so we would need to special
// case vXi64.
while (getTypeAction(Context, VT) != TypeLegal)
VT = getTypeToTransformTo(Context, VT);
// If vector multiply is legal, assume that's faster than shl + add/sub.
// TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
return false;
// shl+add, shl+sub, shl+add+neg
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
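// Illustrative sanity check of the patterns above (example values only, not
// from the surrounding code): a splat multiply by 17 is decomposed since
// MulC - 1 == 16 is a power of 2 (x * 17 == (x << 4) + x), a multiply by 31
// qualifies via MulC + 1 == 32 (x * 31 == (x << 5) - x), while a multiply by
// 22 matches none of the patterns and is left as a plain multiply.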
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
// Mask vectors support all subregister combinations and operations that
// extract half of vector.
if (ResVT.getVectorElementType() == MVT::i1)
return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
(Index == ResVT.getVectorNumElements()));
return (Index % ResVT.getVectorNumElements()) == 0;
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
unsigned Opc = VecOp.getOpcode();
// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
return false;
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT();
}
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const {
// Do not merge up to the float value size (128 bits) if the
// NoImplicitFloat attribute is set.
bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (NoFloat) {
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
// Make sure we don't merge greater than our preferred vector
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
return true;
}
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
EVT VT = Y.getValueType();
if (VT.isVector())
return false;
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
if (VT != MVT::i32 && VT != MVT::i64)
return false;
return !isa<ConstantSDNode>(Y);
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
EVT VT = Y.getValueType();
if (!VT.isVector())
return hasAndNotCompare(Y);
// Vector.
if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
return false;
if (VT == MVT::v4i32)
return true;
return Subtarget.hasSSE2();
}
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
return X.getValueType().isScalarInteger(); // 'bt'
}
bool X86TargetLowering::
shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const {
// Does baseline recommend not to perform the fold by default?
if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
return false;
// For scalars this transform is always beneficial.
if (X.getValueType().isScalarInteger())
return true;
// If all the shift amounts are identical, then transform is beneficial even
// with rudimentary SSE2 shifts.
if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
return true;
// If we have AVX2 with its powerful shift operations, then it's also good.
if (Subtarget.hasAVX2())
return true;
// Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
return NewShiftOpcode == ISD::SHL;
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
if (VT.isVector())
return false;
// 64-bit shifts on 32-bit targets produce really bad bloated code.
if (VT == MVT::i64 && !Subtarget.is64Bit())
return false;
return true;
}
bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
!Subtarget.isOSWindows())
return false;
return true;
}
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
return isTypeLegal(VT);
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
MVT VT = MVT::getIntegerVT(NumBits);
if (isTypeLegal(VT))
return VT;
// PMOVMSKB can handle this.
if (NumBits == 128 && isTypeLegal(MVT::v16i8))
return MVT::v16i8;
// VPMOVMSKB can handle this.
if (NumBits == 256 && isTypeLegal(MVT::v32i8))
return MVT::v32i8;
// TODO: Allow 64-bit type for 32-bit target.
// TODO: 512-bit types should be allowed, but make sure that those
// cases are handled in combineVectorSizedSetCCEquality().
return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
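// Illustrative example (assuming an AVX2 target): a 256-bit equality compare,
// e.g. from a memcmp()==0 expansion, gets MVT::v32i8 back from this hook, so
// it can be lowered as a vector compare plus VPMOVMSKB rather than a chain of
// 64-bit scalar compares.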
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return M == SM_SentinelUndef; });
}
/// Return true if the mask creates a vector whose lower half is undefined.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, 0, NumElts / 2);
}
/// Return true if the mask creates a vector whose upper half is undefined.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
}
/// Return true if Val falls within the specified range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
}
/// Return true if the value of any element in Mask falls within the specified
/// range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
/// Return true if the value of any element in Mask is the zero sentinel value.
static bool isAnyZero(ArrayRef<int> Mask) {
return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
}
/// Return true if the value of any element in Mask is the zero or undef
/// sentinel values.
static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
return llvm::any_of(Mask, [](int M) {
return M == SM_SentinelZero || M == SM_SentinelUndef;
});
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos + Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low, int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
}
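// Illustrative example: isSequentialOrUndefInRange(<4, -1, 6, 7>, 0, 4, 4)
// returns true (the undef at position 1 stands in for 5), whereas
// isSequentialOrUndefInRange(<4, 5, 7, 6>, 0, 4, 4) returns false.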
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef or zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low,
int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return isUndefOrZero(M); });
}
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
int M0 = Mask[i];
int M1 = Mask[i + 1];
// If both elements are undef, it's trivial.
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
WidenedMask[i / 2] = M1 / 2;
continue;
}
if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
return false;
}
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// Otherwise we can't safely widen the elements used in this shuffle.
return false;
}
assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");
return true;
}
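// Illustrative example: the 8-element mask <0, 1, -1, -1, 6, 7, -2, -2> widens
// to <0, -1, 3, -2> (with -1 == SM_SentinelUndef and -2 == SM_SentinelZero),
// while a mask starting <1, 2, ...> fails because its first pair straddles two
// widened elements.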
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0, Size = Mask.size(); i != Size; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
SmallVector<int, 32> WidenedMask;
return canWidenShuffleElements(Mask, WidenedMask);
}
// Attempt to narrow/widen shuffle mask until it matches the target number of
// elements.
static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
SmallVectorImpl<int> &ScaledMask) {
unsigned NumSrcElts = Mask.size();
assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
"Illegal shuffle scale factor");
// Narrowing is guaranteed to work.
if (NumDstElts >= NumSrcElts) {
int Scale = NumDstElts / NumSrcElts;
llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
return true;
}
// We have to repeat the widening until we reach the target size, but we can
// split out the first widening as it sets up ScaledMask for us.
if (canWidenShuffleElements(Mask, ScaledMask)) {
while (ScaledMask.size() > NumDstElts) {
SmallVector<int, 16> WidenedMask;
if (!canWidenShuffleElements(ScaledMask, WidenedMask))
return false;
ScaledMask = std::move(WidenedMask);
}
return true;
}
return false;
}
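// Illustrative example: scaling the 4-element mask <0, 2, -1, 3> up to 8
// elements narrows it to <0, 1, 4, 5, -1, -1, 6, 7>; scaling that mask back
// down to 4 elements recovers <0, 2, -1, 3>. Scaling <1, 0, 2, 3> down to 2
// elements fails because the leading pair is not contiguous.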
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node for negative values when IsMask is set.
// Split 64-bit constants into two i32 elements in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(Values[i], dl, EltVT);
Ops.push_back(OpNode);
if (Split)
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
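// Illustrative example (32-bit target, so MVT::i64 is not legal): requesting a
// v2i64 constant vector for Values == {1, 2} builds a v4i32 vector
// <1, 0, 2, 0> and bitcasts it back to v2i64, which on little-endian x86
// yields the elements <1, 2>.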
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
VT.getVectorElementType() == MVT::i1) &&
"Unexpected vector type");
// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.isFloatingPoint()) {
Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
}
return DAG.getBitcast(VT, Vec);
}
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(ResultVT, dl,
Vec->ops().slice(IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl,
unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting UNDEF just returns Result unchanged.
if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
IdxVal &= ~(ElemsPerChunk - 1);
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
: DAG.getUNDEF(VT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
DAG.getIntPtrConstant(0, dl));
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, unsigned WideSizeInBits) {
assert(Vec.getValueSizeInBits() < WideSizeInBits &&
(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
"Unsupported vector widening type");
unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
MVT SVT = Vec.getSimpleValueType().getScalarType();
MVT VT = MVT::getVectorVT(SVT, WideNumElts);
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
assert(Ops.empty() && "Expected an empty ops vector");
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
Ops.append(N->op_begin(), N->op_end());
return true;
}
if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2)) {
// insert_subvector(insert_subvector(undef, x, lo), y, hi)
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
return true;
}
// insert_subvector(x, extract_subvector(x, lo), hi)
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
Ops.append(2, Sub);
return true;
}
}
}
return false;
}
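// Illustrative example: concat_vectors(a, b) collects {a, b}; so does
// insert_subvector(insert_subvector(undef, a, 0), b, NumElts/2). The pattern
// insert_subvector(x, extract_subvector(x, 0), NumElts/2) collects the low
// half of x twice.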
static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) {
EVT VT = Op.getValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector");
SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
return std::make_pair(Lo, Hi);
}
// Split an unary integer op into 2 half sized ops.
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Make sure we only try to split 256/512-bit types to avoid creating
// narrow vectors.
assert((Op.getOperand(0).getValueType().is256BitVector() ||
Op.getOperand(0).getValueType().is512BitVector()) &&
(VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
VT.getVectorNumElements() &&
"Unexpected VTs!");
SDLoc dl(Op);
// Extract the Lo/Hi vectors
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
}
/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Sanity check that all the types match.
assert(Op.getOperand(0).getValueType() == VT &&
Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
SDLoc dl(Op);
// Extract the LHS Lo/Hi vectors
SDValue LHS1, LHS2;
std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
// Extract the RHS Lo/Hi vectors
SDValue RHS1, RHS2;
std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
}
// Helper for splitting operands of an operation into legal target-sized pieces
// and applying a function to each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
if ((CheckBWI && Subtarget.useBWIRegs()) ||
(!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
}
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256) {
NumSubs = VT.getSizeInBits() / 256;
assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
}
} else {
if (VT.getSizeInBits() > 128) {
NumSubs = VT.getSizeInBits() / 128;
assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
}
}
if (NumSubs == 1)
return Builder(DAG, DL, Ops);
SmallVector<SDValue, 4> Subs;
for (unsigned i = 0; i != NumSubs; ++i) {
SmallVector<SDValue, 2> SubOps;
for (SDValue Op : Ops) {
EVT OpVT = Op.getValueType();
unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
}
Subs.push_back(Builder(DAG, DL, SubOps));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
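// Hypothetical usage sketch (the value names and the ISD::ADD builder below
// are purely illustrative, not taken from this file):
//   SDValue Res = SplitOpsAndApply(
//       DAG, Subtarget, DL, MVT::v64i8, {LHS, RHS},
//       [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) {
//         return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//       });
// Without BWI this emits two v32i8 adds (or four v16i8 adds without AVX2) and
// concatenates the results; with BWI the builder sees the full v64i8 operands.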
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
unsigned IdxVal = Op.getConstantOperandVal(2);
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put the subvector in the upper part.
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
// Do an optimization for the most frequently used types.
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
Mask0.flipAllBits();
SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
// Isolate the bits below the insertion point.
unsigned LowShift = NumElems - IdxVal;
SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
// Isolate the bits after the last inserted bit.
unsigned HighShift = IdxVal + SubVecNumElems;
SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
// Now OR all 3 pieces together.
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
const SDLoc &dl) {
assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
EVT SubVT = V1.getValueType();
EVT SubSVT = SubVT.getScalarType();
unsigned SubNumElts = SubVT.getVectorNumElements();
unsigned SubVectorWidth = SubVT.getSizeInBits();
EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG:
return ISD::ANY_EXTEND_VECTOR_INREG;
case ISD::ZERO_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
return ISD::ZERO_EXTEND_VECTOR_INREG;
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return ISD::SIGN_EXTEND_VECTOR_INREG;
}
llvm_unreachable("Unknown opcode");
}
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
ISD::ZERO_EXTEND == Opcode) &&
"Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
if (InVT.getSizeInBits() > 128) {
assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
if (VT.getVectorNumElements() != InVT.getVectorNumElements())
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, In);
}
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
Not, V.getOperand(1));
}
}
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps)) {
for (SDValue &CatOp : CatOps) {
SDValue NotCat = IsNOT(CatOp, DAG);
if (!NotCat) return SDValue();
CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
}
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
}
return SDValue();
}
void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Lo, bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
Pos += (Lo ? 0 : NumEltsInLane / 2);
Mask.push_back(Pos);
}
}
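// Illustrative example: for VT == MVT::v8i16 this produces
//   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>  (the PUNPCKLWD pattern)
//   Hi, unary : <4, 4, 5, 5, 6, 6, 7, 7>    (both halves taken from V1)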
/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
/// imposed by AVX and specific to the unary pattern. Example:
/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Lo) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
for (int i = 0; i < NumElts; ++i) {
int Pos = i / 2;
Pos += (Lo ? 0 : NumElts / 2);
Mask.push_back(Pos);
}
}
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec[i] = (i == Idx) ? NumElems : i;
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
if (Ptr.getOpcode() == X86ISD::Wrapper ||
Ptr.getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr.getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
return nullptr;
return CNode->getConstVal();
}
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
return getTargetConstantFromBasePtr(Load->getBasePtr());
}
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
}
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
assert(LD && "Unexpected null LoadSDNode");
return getTargetConstantFromNode(LD);
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
return false;
SDValue Ptr = MemIntr->getBasePtr();
if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Insert constant bits from base and subvector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
}
}
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
}
}
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
return false;
UndefElts = APInt::getNullValue(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
EltBits.push_back(APInt::getNullValue(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
EltBits.push_back(EltBits0[M]);
} else {
if (UndefElts1[M - NumElts])
UndefElts.setBit(i);
EltBits.push_back(EltBits1[M - NumElts]);
}
}
return true;
}
return false;
}
namespace llvm {
namespace X86 {
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
UndefElts, EltBits, true,
AllowPartialUndefs)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
continue;
if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
SplatIndex = -1;
break;
}
SplatIndex = i;
}
if (0 <= SplatIndex) {
SplatVal = EltBits[SplatIndex];
return true;
}
}
return false;
}
} // namespace X86
} // namespace llvm
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask,
APInt &UndefElts) {
// Extract the raw target constant bits.
SmallVector<APInt, 64> EltBits;
if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
EltBits, /* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false))
return false;
// Insert the extracted elements into the mask.
for (APInt Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
unsigned Repetitions = 1u << (NumStages - 1);
unsigned Increment = 1u << NumStages;
assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
Mask.push_back(Elt + (Lane * NumEltsPerLane));
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
}
}
}
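// Illustrative example: for VT == MVT::v16i8, Unary == false and NumStages == 1
// this produces <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>,
// i.e. the low byte of each i16 lane of the two (bitcast) inputs, matching the
// PACKSSWB/PACKUSWB truncation pattern when saturation cannot trigger.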
// Split the demanded elts of a PACKSS/PACKUS node between its operands.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
DemandedLHS = APInt::getNullValue(NumInnerElts);
DemandedRHS = APInt::getNullValue(NumInnerElts);
// Map DemandedElts to the packed operands.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
DemandedRHS.setBit(InnerIdx);
}
}
}
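// Illustrative example: for a v16i8 pack result, demanding result elements 0
// and 9 maps to demanding element 0 of the v8i16 LHS and element 1 of the
// v8i16 RHS.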
// Split the demanded elts of a HADD/HSUB node between its operands.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumEltsPerLane = NumElts / NumLanes;
int HalfEltsPerLane = NumEltsPerLane / 2;
DemandedLHS = APInt::getNullValue(NumElts);
DemandedRHS = APInt::getNullValue(NumElts);
// Map DemandedElts to the horizontal operands.
for (int Idx = 0; Idx != NumElts; ++Idx) {
if (!DemandedElts[Idx])
continue;
int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
int LocalIdx = Idx % NumEltsPerLane;
if (LocalIdx < HalfEltsPerLane) {
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
} else {
LocalIdx -= HalfEltsPerLane;
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
}
}
}
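// Illustrative example: for a v4i32 HADD, result element 1 is LHS[2] + LHS[3]
// and result element 2 is RHS[0] + RHS[1], so demanding result elements
// {1, 2} sets DemandedLHS = {2, 3} and DemandedRHS = {0, 1}.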
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
uint64_t ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
bool IsFakeUnary = false;
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeINSERTPSMask(ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::VALIGN:
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVALIGNMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST:
// We only decode broadcasts of same-sized vectors; peeking through to
// extracted subvectors is likely to cause hasOneUse issues with
// SimplifyDemandedBits etc.
if (N->getOperand(0).getValueType() == VT) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
return false;
case X86ISD::VPERMILPV: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero && isAnyZero(Mask))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
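// Example (illustrative sketch of the decode above): a binary v4i32
// X86ISD::UNPCKL yields Mask = {0,4,1,5} with Ops = {Op0, Op1}; if both
// operands are the same node, the fake-unary remapping above turns this
// into Mask = {0,0,1,1} while still returning both operands in Ops.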
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2,
APInt &KnownUndef, APInt &KnownZero) {
int Size = Mask.size();
KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
int VectorSizeInBits = V1.getValueSizeInBits();
int ScalarSizeInBits = VectorSizeInBits / Size;
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
if (M < 0) {
KnownUndef.setBit(i);
continue;
}
if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
KnownZero.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
M %= Size;
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
// If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
// the (larger) source element must be UNDEF/ZERO.
if ((Size % V.getNumOperands()) == 0) {
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef())
KnownUndef.setBit(i);
if (X86::isZeroNode(Op))
KnownZero.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
if (Val == 0)
KnownZero.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
if (Val == 0)
KnownZero.setBit(i);
}
continue;
}
// If the BUILD_VECTOR has more elements, then all the (smaller) source
// elements must be UNDEF or ZERO.
if ((V.getNumOperands() % Size) == 0) {
int Scale = V->getNumOperands() / Size;
bool AllUndef = true;
bool AllZero = true;
for (int j = 0; j < Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllUndef &= Op.isUndef();
AllZero &= X86::isZeroNode(Op);
}
if (AllUndef)
KnownUndef.setBit(i);
if (AllZero)
KnownZero.setBit(i);
continue;
}
}
}
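// Example (illustrative): with V1 = BUILD_VECTOR(x, 0, undef, 0), V2 an
// all-zeros vector and Mask = {1, 2, -1, 7}, the loop above marks elements
// 0 and 3 as KnownZero (V1[1] is zero, V2 is all-zeros) and elements 1 and
// 2 as KnownUndef (V1[2] is undef, mask sentinel).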
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
APInt &KnownUndef, APInt &KnownZero) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
MVT VT = N.getSimpleValueType();
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
int Size = Mask.size();
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Size) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
// Extract known constant input data.
APInt UndefSrcElts[2];
SmallVector<APInt, 32> SrcEltBits[2];
bool IsSrcConstant[2] = {
getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
SrcEltBits[0], true, false),
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
if (M < 0) {
assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
if (SM_SentinelUndef == M)
KnownUndef.setBit(i);
if (SM_SentinelZero == M)
KnownZero.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
KnownUndef.setBit(i);
continue;
}
// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
// TODO: We currently only set UNDEF for integer types - floats use the same
// registers as vectors and many of the scalar folded loads rely on the
// SCALAR_TO_VECTOR pattern.
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Size % V.getValueType().getVectorNumElements()) == 0) {
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
KnownUndef.setBit(i);
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
KnownZero.setBit(i);
continue;
}
// INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
// base vectors.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Vec = V.getOperand(0);
int NumVecElts = Vec.getValueType().getVectorNumElements();
if (Vec.isUndef() && Size == NumVecElts) {
int Idx = V.getConstantOperandVal(2);
int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
if (M < Idx || (Idx + NumSubElts) <= M)
KnownUndef.setBit(i);
}
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
KnownUndef.setBit(i);
else if (SrcEltBits[SrcIdx][M] == 0)
KnownZero.setBit(i);
}
}
assert(VT.getVectorNumElements() == (unsigned)Size &&
"Different mask size from vector size!");
return true;
}
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
const APInt &KnownZero,
bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
APInt &KnownUndef,
APInt &KnownZero) {
unsigned NumElts = Mask.size();
KnownUndef = KnownZero = APInt::getNullValue(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (SM_SentinelUndef == M)
KnownUndef.setBit(i);
if (SM_SentinelZero == M)
KnownZero.setBit(i);
}
}
// Forward declaration (for getFauxShuffleMask recursive check).
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::VECTOR_SHUFFLE: {
// ISD::VECTOR_SHUFFLE is not treated as a target shuffle, so decode it here.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
return false;
}
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
KnownBits Known0 =
DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
KnownBits Known1 =
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
SelectMask.setBit(i);
else if (LHS == 255 && RHS == 255)
ZeroMask.setBit(i);
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
}
if (IsByteMask) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
Mask.push_back(Idx);
}
}
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
}
// Handle the OR(SHUFFLE,SHUFFLE) case where, per element, one source is
// zero and the other provides a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
true) ||
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
// Shuffle inputs must be the same size as the result.
if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
}))
return false;
if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
}))
return false;
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
return false;
}
Ops.append(SrcInputs0.begin(), SrcInputs0.end());
Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
if (!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
Ops.push_back(Src);
Ops.push_back(Sub.getOperand(0));
return true;
}
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
// Subvector shuffle inputs must not be larger than the subvector.
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
}))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
}
}
Ops.push_back(Src);
Ops.append(SubInputs.begin(), SubInputs.end());
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW:
case ISD::SCALAR_TO_VECTOR:
case ISD::INSERT_VECTOR_ELT: {
// Match against an insert_vector_elt/scalar_to_vector of an extract from a
// vector, for matching src/dst vector types.
SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
unsigned DstIdx = 0;
if (Opcode != ISD::SCALAR_TO_VECTOR) {
// Check we have an in-range constant insertion index.
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
N.getConstantOperandAPInt(2).uge(NumElts))
return false;
DstIdx = N.getConstantOperandVal(2);
// Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
if (X86::isZeroNode(Scl)) {
Ops.push_back(N.getOperand(0));
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
return true;
}
}
// Peek through trunc/aext/zext.
// TODO: aext shouldn't require SM_SentinelZero padding.
// TODO: handle shift of scalars.
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
while (Scl.getOpcode() == ISD::TRUNCATE ||
Scl.getOpcode() == ISD::ANY_EXTEND ||
Scl.getOpcode() == ISD::ZERO_EXTEND) {
Scl = Scl.getOperand(0);
MinBitsPerElt =
std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
}
if ((MinBitsPerElt % 8) != 0)
return false;
// Attempt to find the source vector the scalar was extracted from.
SDValue SrcExtract;
if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
Scl.getOpcode() == X86ISD::PEXTRW ||
Scl.getOpcode() == X86ISD::PEXTRB) &&
Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
SrcExtract = Scl;
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
if (!SrcVT.getScalarType().isByteSized())
return false;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
unsigned DstByte = DstIdx * NumBytesPerElt;
MinBitsPerElt =
std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
// Create 'identity' byte level shuffle mask and then add inserted bytes.
if (Opcode == ISD::SCALAR_TO_VECTOR) {
Ops.push_back(SrcVec);
Mask.append(NumSizeInBytes, SM_SentinelUndef);
} else {
Ops.push_back(SrcVec);
Ops.push_back(N.getOperand(0));
for (int i = 0; i != (int)NumSizeInBytes; ++i)
Mask.push_back(NumSizeInBytes + i);
}
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
Mask[DstByte + i] = SM_SentinelZero;
return true;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen, we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
(!N1.isUndef() &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
bool IsUnary = (N0 == N1);
Ops.push_back(N0);
if (!IsUnary)
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VTRUNC: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Truncated source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
for (unsigned i = 0; i != NumSrcElts; ++i)
Mask.push_back(i * Scale);
Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
Ops.push_back(Src);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumSizeInBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case X86ISD::VROTLI:
case X86ISD::VROTRI: {
// We can only decode 'whole byte' bit rotates as shuffles.
uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
if ((RotateVal % 8) != 0)
return false;
Ops.push_back(N.getOperand(0));
int Offset = RotateVal / 8;
Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
for (int i = 0; i != (int)NumElts; ++i) {
int BaseIdx = i * NumBytesPerElt;
for (int j = 0; j != (int)NumBytesPerElt; ++j) {
Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
}
}
return true;
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
if (!Src.getSimpleValueType().isVector())
return false;
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
}
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
IsAnyExtend, Mask);
Ops.push_back(Src);
return true;
}
}
return false;
}
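// Example (illustrative, for the VSHLI/VSRLI case above): a v2i64 VSHLI by
// 16 bits is decoded as a byte-level shuffle with ByteShift == 2, giving
// the mask {Z,Z,0,1,2,3,4,5, Z,Z,8,9,10,11,12,13} where Z is
// SM_SentinelZero.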
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
SmallVector<SDValue, 16> UsedInputs;
for (int i = 0, e = Inputs.size(); i < e; ++i) {
int lo = UsedInputs.size() * MaskWidth;
int hi = lo + MaskWidth;
// Strip UNDEF input usage.
if (Inputs[i].isUndef())
for (int &M : Mask)
if ((lo <= M) && (M < hi))
M = SM_SentinelUndef;
// Check for unused inputs.
if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
for (int &M : Mask)
if (lo <= M)
M -= MaskWidth;
continue;
}
// Check for repeated inputs.
bool IsRepeat = false;
for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
if (UsedInputs[j] != Inputs[i])
continue;
for (int &M : Mask)
if (lo <= M)
M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
IsRepeat = true;
break;
}
if (IsRepeat)
continue;
UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
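// Example (illustrative): with Inputs = {A, B, A} and the width-4 mask
// {8, 1, 5, 11}, the repeated trailing A is folded into the first one,
// leaving Inputs = {A, B} and Mask = {0, 1, 5, 3}.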
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
APInt &KnownUndef, APInt &KnownZero,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
return false;
if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
if (ResolveKnownElts)
resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
return true;
}
if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
ResolveKnownElts)) {
resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
return true;
}
return false;
}
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
const SelectionDAG &DAG, unsigned Depth = 0,
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
return false;
APInt KnownUndef, KnownZero;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
KnownZero, DAG, Depth, ResolveKnownElts);
}
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
SelectionDAG &DAG, unsigned Depth) {
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue(); // Limit search depth.
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
unsigned NumElems = VT.getVectorNumElements();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = VT.getSimpleVT();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
: DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
if (Opcode == ISD::INSERT_SUBVECTOR) {
SDValue Vec = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
uint64_t SubIdx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
}
// Recurse into concat_vectors sub vector to find scalars.
if (Opcode == ISD::CONCAT_VECTORS) {
EVT SubVT = Op.getOperand(0).getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
uint64_t SubIdx = Index / NumSubElts;
uint64_t SubElt = Index % NumSubElts;
return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
if (Opcode == ISD::EXTRACT_SUBVECTOR) {
SDValue Src = Op.getOperand(0);
uint64_t SrcIdx = Op.getConstantOperandVal(1);
return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
}
// We only peek through bitcasts of the same vector width.
if (Opcode == ISD::BITCAST) {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
return SDValue();
}
// Actual nodes that may contain scalar elements.
// For insert_vector_elt - either return the scalar at the matching index or
// recurse into the base vector.
if (Opcode == ISD::INSERT_VECTOR_ELT &&
isa<ConstantSDNode>(Op.getOperand(2))) {
if (Op.getConstantOperandAPInt(2) == Index)
return Op.getOperand(1);
return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
}
if (Opcode == ISD::SCALAR_TO_VECTOR)
return (Index == 0) ? Op.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
if (Opcode == ISD::BUILD_VECTOR)
return Op.getOperand(Index);
return SDValue();
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
"Illegal vector insertion");
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < NumElts; ++i) {
bool IsNonZero = (NonZeros & (1 << i)) != 0;
if (!IsNonZero)
continue;
// If the build vector contains zeros or our first insertion is not the
// first index, then insert into a zero vector to break any register
// dependency; otherwise use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
V = getZeroVector(VT, Subtarget, DAG, dl);
else {
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
DAG.getIntPtrConstant(i, dl));
}
return V;
}
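// Example (illustrative): building v8i16 {x, 0, y, 0, 0, 0, 0, 0} has
// NonZeros == 0b101 and NumZero > 0, so the loop above starts from a zero
// vector and emits INSERT_VECTOR_ELT (PINSRW) only at indices 0 and 2.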
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
SDValue V;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
if (!ThisIsNonZero && !NextIsNonZero)
continue;
// FIXME: Investigate combining the first 4 bytes as a i32 instead.
SDValue Elt;
if (ThisIsNonZero) {
if (NumZero || NextIsNonZero)
Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
if (NextIsNonZero) {
SDValue NextElt = Op.getOperand(i + 1);
if (i == 0 && NumZero)
NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
else
NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
DAG.getConstant(8, dl, MVT::i8));
if (ThisIsNonZero)
Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
else
Elt = NextElt;
}
// If our first insertion is not the first index or zeros are needed, then
// insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
// elements undefined).
if (!V) {
if (i != 0 || NumZero)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
V = DAG.getBitcast(MVT::v8i16, V);
continue;
}
}
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
// Use PINSRW to insert each 16-bit element directly.
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
// Because we're creating a less complicated build vector here, we may enable
// further folding of the MOVDDUP via shuffle transforms.
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
Op.getOperand(0) == Op.getOperand(2) &&
Op.getOperand(1) == Op.getOperand(3) &&
Op.getOperand(0) != Op.getOperand(1)) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Create a new build vector with the first 2 elements followed by undef
// padding, bitcast to v2f64, duplicate, and bitcast back.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
return DAG.getBitcast(VT, Dup);
}
// Find all zeroable elements.
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
SDValue Elt = Op.getOperand(i);
Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
SDValue V1 = FirstNonZero.getOperand(0);
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
SDValue VZeroOrUndef = (Zeroable == Undefs)
? DAG.getUNDEF(VT)
: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL, true));
return DAG.getBitcast(VT, Result);
}
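// Example (illustrative): the INSERTPS immediate built above packs the
// source element in bits 7:6, the destination slot in bits 5:4 and the
// zero mask in bits 3:0, so inserting V2[2] into slot 1 while zeroing
// element 3 gives InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98.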
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
SelectionDAG &DAG, const TargetLowering &TLI,
const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load, and if
// the address is "base + cst", see if the cst can be "absorbed" into
// the shuffle mask.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
if (!ISD::isNormalLoad(LD) || !LD->isSimple())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
return SDValue();
int FI = -1;
int64_t Offset = 0;
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
FI = FINode->getIndex();
Offset = 0;
} else if (DAG.isBaseWithConstantOffset(Ptr) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
Offset = Ptr.getConstantOperandVal(1);
Ptr = Ptr.getOperand(0);
} else {
return SDValue();
}
// FIXME: 256-bit vector instructions don't require a strict alignment,
// improve this code to support it better.
Align RequiredAlign(VT.getSizeInBits() / 8);
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
if (!InferredAlign || *InferredAlign < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead,
// if someone *really* cares about this; that's the way to implement it.
return SDValue();
} else {
MFI.setObjectAlignment(FI, RequiredAlign);
}
}
// (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
if ((Offset % RequiredAlign.value()) & 3)
return SDValue();
int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
}
int EltNo = (Offset - StartOffset) >> 2;
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
}
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
if (ISD::isNON_EXTLoad(Elt.getNode())) {
auto *BaseLd = cast<LoadSDNode>(Elt);
if (!BaseLd->isSimple())
return false;
Ld = BaseLd;
ByteOffset = 0;
return true;
}
switch (Elt.getOpcode()) {
case ISD::BITCAST:
case ISD::TRUNCATE:
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
uint64_t Idx = IdxC->getZExtValue();
if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
ByteOffset += Idx / 8;
return true;
}
}
break;
case ISD::EXTRACT_VECTOR_ELT:
if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
SDValue Src = Elt.getOperand(0);
unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
findEltLoadSrc(Src, Ld, ByteOffset)) {
uint64_t Idx = IdxC->getZExtValue();
ByteOffset += Idx * (SrcSizeInBits / 8);
return true;
}
}
break;
}
return false;
}
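// Example (illustrative): for Elt = (trunc (srl (load i64 *p), 32)) the
// recursion above returns the i64 load with ByteOffset == 4, i.e. the
// element is the upper half of the loaded value.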
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
if ((VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
APInt LoadMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef()) {
UndefMask.setBit(i);
continue;
}
if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
ZeroMask.setBit(i);
continue;
}
// Each loaded element must be the correct fractional portion of the
// requested vector load.
unsigned EltSizeInBits = Elt.getValueSizeInBits();
if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
return SDValue();
if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
return SDValue();
unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
return SDValue();
LoadMask.setBit(i);
LastLoadedElt = i;
}
assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
LoadSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// TODO: Support offsetting the base load.
if (ByteOffsets[FirstLoadedElt] != 0)
return SDValue();
// Check to see if the element's load is consecutive to the base load
// or offset from a previous (already checked) load.
auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
LoadSDNode *Ld = Loads[EltIdx];
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
}
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
EltIdx - FirstLoadedElt);
};
// Consecutive loads can contain UNDEFs but not ZERO elements.
// Consecutive loads with UNDEFs and ZERO elements require an
// additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(LDBase->isSimple() &&
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
// Check if the base load is entirely dereferenceable.
bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
// LOAD - all consecutive load/undefs (must start/end with a load or be
// entirely dereferenceable). If we have found an entire vector of loads and
// undefs, then return a large load of the entire vector width starting at the
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
unsigned NumMaskElts = VT.getVectorNumElements();
if ((NumMaskElts % NumElems) == 0) {
unsigned Scale = NumMaskElts / NumElems;
SmallVector<int, 4> ClearMask(NumMaskElts, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (UndefMask[i])
continue;
int Offset = ZeroMask[i] ? NumMaskElts : 0;
for (unsigned j = 0; j != Scale; ++j)
ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
}
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
// Allow v4f32 on SSE1 only targets.
// FIXME: Add more isel patterns so we can just use VT directly.
if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
VecVT = MVT::v4f32;
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode = DAG.getMemIntrinsicNode(
X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
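// Repeats wider than 64 bits are broadcast as subvectors rather than scalars,
// so cap the scalar size at 64 bits; sub-32-bit element broadcasts need AVX2.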
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}
// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;
EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
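// If the repeating unit is wider than a scalar (RepeatSize > ScalarSize),
// broadcast it as a subvector; otherwise broadcast the scalar element.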
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}
}
return SDValue();
}
// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
return SDValue();
}
assert(Elts.size() == VT.getVectorNumElements());
return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
isAfterLegalize);
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
if (ScalarSize == 32) {
Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
} else {
assert(ScalarSize == 64 && "Unsupported floating point scalar size");
Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
}
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
}
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
unsigned Opc = U->getOpcode();
// VPERMV/VPERMV3 shuffles can never fold their index operands.
if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
return false;
if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
return false;
if (isTargetShuffle(Opc))
return true;
if (Opc == ISD::BITCAST) // Ignore bitcasts
return isFoldableUseOfShuffle(U);
if (N->hasOneUse())
return true;
}
return false;
}
// Check if the current node of the build vector is a zero-extended vector.
// If so, return the extended value.
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
// NumElt - returns the number of zero-extended identical values.
// EltType - returns the type of the value including the zero extension.
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
unsigned &NumElt, MVT &EltType) {
SDValue ExtValue = Op->getOperand(0);
unsigned NumElts = Op->getNumOperands();
unsigned Delta = NumElts;
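// Find the distance to the first repeat of the candidate value; every element
// before that repeat must be zero or undef for the zero-extension pattern.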
for (unsigned i = 1; i < NumElts; i++) {
if (Op->getOperand(i) == ExtValue) {
Delta = i;
break;
}
if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
return SDValue();
}
if (!isPowerOf2_32(Delta) || Delta == 1)
return SDValue();
for (unsigned i = Delta; i < NumElts; i++) {
if (i % Delta == 0) {
if (Op->getOperand(i) != ExtValue)
return SDValue();
} else if (!(isNullConstant(Op->getOperand(i)) ||
Op->getOperand(i).isUndef()))
return SDValue();
}
unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
unsigned ExtVTSize = EltSize * Delta;
EltType = MVT::getIntegerVT(ExtVTSize);
NumElt = NumElts / Delta;
return ExtValue;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
MVT EltType = VT.getScalarType();
unsigned NumElts = VT.getVectorNumElements();
SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
if (ZeroExtended)
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);
}
}
}
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// Replace BUILD_VECTOR with a broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
if (SplatBitSize == 32 || SplatBitSize == 64 ||
(SplatBitSize < 32 && Subtarget.hasAVX2())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
SDVTList Tys =
DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CP};
MachinePointerInfo MPI =
MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
SDValue Brdcst = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
MachineMemOperand::MOLoad);
return DAG.getBitcast(VT, Brdcst);
}
if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
// If we are moving a scalar into a vector (Ld must be set and all elements
// but 1 are undef) and that operation is not obviously supported by
// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
// That's better than general shuffling and may eliminate a load to GPR and
// move from scalar to vector register.
if (!Ld || NumElts - NumUndefElts != 1)
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
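// If the single defined element is element 0 and is 32/64 bits wide, a plain
// vmovd/vmovq/vmovss/vmovsd already handles it, so don't form a broadcast.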
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();
}
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
// FIXME: Is the use count needed for non-constant, non-load case?
if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CP};
MachinePointerInfo MPI =
MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
MPI, Alignment, MachineMemOperand::MOLoad);
}
}
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
// Make sure the non-chain result is only used by this build vector.
if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64)) {
auto *LN = cast<LoadSDNode>(Ld);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BCast =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
return BCast;
}
// The integer check is needed for broadcasting a 64-bit element into a
// 128-bit vector; it must not match double, since there is no vbroadcastsd xmm.
if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
(ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
auto *LN = cast<LoadSDNode>(Ld);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BCast =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
return BCast;
}
// Unsupported broadcast.
return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
SDValue ExtIdx) {
int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
return Idx;
// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// lowered this:
// (extract_vector_elt (v8f32 %1), Constant<6>)
// to:
// (extract_vector_elt (vector_shuffle<2,u,u,u>
// (extract_subvector (v8f32 %0), Constant<4>),
// undef)
// Constant<0>)
// In this case the vector is the extract_subvector expression and the index
// is 2, as specified by the shuffle.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
SDValue ShuffleVec = SVOp->getOperand(0);
MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
assert(ShuffleVecVT.getVectorElementType() ==
ExtractedFromVec.getSimpleValueType().getVectorElementType());
int ShuffleIdx = SVOp->getMaskElt(Idx);
if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
ExtractedFromVec = ShuffleVec;
return ShuffleIdx;
}
return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
SDValue VecIn2;
SmallVector<unsigned, 4> InsertIndices;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Opc = Op.getOperand(i).getOpcode();
if (Opc == ISD::UNDEF)
continue;
if (Opc != ISD::EXTRACT_VECTOR_ELT) {
// Quit if more than one element needs inserting.
if (InsertIndices.size() > 1)
return SDValue();
InsertIndices.push_back(i);
continue;
}
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors need shuffling.
return SDValue();
}
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
return NV;
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
bool HasConstElts = false;
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
Immediate |= (InC->getZExtValue() & 0x1) << idx;
HasConstElts = true;
} else {
NonConstIdx.push_back(idx);
}
if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
}
// For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
if (IsSplat) {
// The build_vector allows the scalar element to be larger than the vector
// element type. We need to mask it to use as a condition unless we know
// the upper bits are zero.
// FIXME: Use computeKnownBits instead of checking specific opcode?
SDValue Cond = Op.getOperand(SplatIdx);
assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
if (Cond.getOpcode() != ISD::SETCC)
Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
DAG.getConstant(1, dl, MVT::i8));
// Perform the select in the scalar domain so we can use cmov.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
DAG.getAllOnesConstant(dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
Select = DAG.getBitcast(MVT::v32i1, Select);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
} else {
MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
DAG.getAllOnesConstant(dl, ImmVT),
DAG.getConstant(0, dl, ImmVT));
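// For mask types narrower than 8 elements, bitcast via v8i1 so the cast from
// the ImmVT-sized scalar is legal, then extract the original width below.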
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
Select = DAG.getBitcast(VecVT, Select);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
DAG.getIntPtrConstant(0, dl));
}
}
// insert elements one by one
SDValue DstVec;
if (HasConstElts) {
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
} else {
MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
DstVec = DAG.getBitcast(VecVT, Imm);
DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
DAG.getIntPtrConstant(0, dl));
}
} else
DstVec = DAG.getUNDEF(VT);
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
DAG.getIntPtrConstant(InsertIdx, dl));
}
return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
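// ExpectedVExtractIdx tracks the extract index we expect for the next element
// pair; it advances by 2 per element and resets to BaseIdx at the midpoint,
// where the source switches from V0 to V1.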
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = Op0.getConstantOperandVal(1);
unsigned I1 = Op1.getConstantOperandVal(1);
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128 bits of V0 and the upper 128 bits of V0. The second
/// horizontal binop dag node takes as input the lower 128 bits of V1
/// and the upper 128 bits of V1.
/// Example:
/// HADD V0_LO, V0_HI
/// HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
/// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
/// Example:
/// HADD V0_LO, V1_LO
/// HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
MVT VT = V0.getSimpleValueType();
assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
unsigned &NumExtracts,
bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting/adding two integer/float elements.
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF)
continue;
// Early exit if we found an unexpected opcode.
if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = Op0.getConstantOperandVal(1);
if (I0 != i)
return false;
// We found a valid add/sub node; make sure it's the same opcode as previous
// elements for this parity.
if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
return false;
Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that the input operands to each add/sub node always
// come from the same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Increment the number of extractions done.
++NumExtracts;
}
// Ensure we have found an opcode for both parities and that they are
// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
// inputs are undef.
if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
InVec0.isUndef() || InVec1.isUndef())
return false;
IsSubAdd = Opc[0] == ISD::FADD;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already been
/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before the replacement of such an SDNode with an ADDSUB operation. Thus the
/// number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
/// %AB = fmul fast <2 x double> %A, %B
/// %Sub = fsub fast <2 x double> %AB, %C
/// %Add = fadd fast <2 x double> %AB, %C
/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
/// <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
/// %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
/// %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
unsigned ExpectedUses) {
if (Opnd0.getOpcode() != ISD::FMUL ||
!Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
return false;
// FIXME: These checks must match the similar ones in
// DAGCombiner::visitFADDForFMACombine. It would be good to have one
// function that would answer if it is Ok to fuse MUL + ADD to FMADD
// or MUL + ADDSUB to FMADDSUB.
const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
if (!AllowFusion)
return false;
Opnd2 = Opnd1;
Opnd1 = Opnd0.getOperand(1);
Opnd0 = Opnd0.getOperand(0);
return true;
}
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB,
/// or X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
bool IsSubAdd;
if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
IsSubAdd))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
SDLoc DL(BV);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
// We only support ADDSUB.
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
// recognition.
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned &HOpcode, SDValue &V0, SDValue &V1) {
// Initialize outputs to known values.
MVT VT = BV->getSimpleValueType(0);
HOpcode = ISD::DELETED_NODE;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
// half of the result is calculated independently from the 128-bit halves of
// the inputs, so that makes the index-checking logic below more complicated.
unsigned NumElts = VT.getVectorNumElements();
unsigned GenericOpcode = ISD::DELETED_NODE;
unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
for (unsigned i = 0; i != Num128BitChunks; ++i) {
for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
// Ignore undef elements.
SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
if (Op.isUndef())
continue;
// If there's an opcode mismatch, we're done.
if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
return false;
// Initialize horizontal opcode.
if (HOpcode == ISD::DELETED_NODE) {
GenericOpcode = Op.getOpcode();
switch (GenericOpcode) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
}
}
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0.getOperand(0) != Op1.getOperand(0) ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
return false;
// The source vector is chosen based on which 64-bit half of the
// destination vector is being calculated.
if (j < NumEltsIn64Bits) {
if (V0.isUndef())
V0 = Op0.getOperand(0);
} else {
if (V1.isUndef())
V1 = Op0.getOperand(0);
}
SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
if (SourceVec != Op0.getOperand(0))
return false;
// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
unsigned ExpectedIndex = i * NumEltsIn128Bits +
(j % NumEltsIn64Bits) * 2;
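// e.g. for v8i32 (2 x 128-bit chunks, 4 elts per chunk, 2 per 64-bit half):
// chunk i=1, element j=3 gives ExpectedIndex = 1*4 + (3%2)*2 = 6, so we expect
// extracts of source elements 6 and 7.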
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
continue;
// If this is not a commutative op, this does not match.
if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
return false;
// Addition is commutative, so try swapping the extract indexes.
// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
continue;
// Extract indexes do not match horizontal requirement.
return false;
}
}
// We matched. Opcode and operands are returned by reference as arguments.
return true;
}
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
SelectionDAG &DAG, unsigned HOpcode,
SDValue V0, SDValue V1) {
// If either input vector is not the same size as the build vector,
// extract/insert the low bits to the correct size.
// This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
unsigned Width = VT.getSizeInBits();
if (V0.getValueSizeInBits() > Width)
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
else if (V0.getValueSizeInBits() < Width)
V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
if (V1.getValueSizeInBits() > Width)
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
unsigned NumElts = VT.getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
if (BV->getOperand(i).isUndef())
DemandedElts.clearBit(i);
// If we don't need the upper xmm, then perform as an xmm hop.
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
MVT HalfVT = VT.getHalfNumVectorElementsVT();
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
}
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
unsigned NumNonUndefs =
count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
// There are 4 sets of horizontal math operations distinguished by type:
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
unsigned HOpcode;
SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
}
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
return SDValue();
// Count the number of UNDEF operands in the build_vector in input.
unsigned NumElts = VT.getVectorNumElements();
unsigned Half = NumElts / 2;
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
for (unsigned i = 0, e = Half; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
SDLoc DL(BV);
SDValue InVec0, InVec1;
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
InVec1) &&
isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
if (CanFold) {
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector. We must adjust the outputs from the partial horizontal
// matching calls above to account for undefined vector halves.
SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
isUndefHI);
}
}
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) {
unsigned X86Opcode;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::HSUB;
else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHADD;
else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHSUB;
else
return SDValue();
// Don't try to expand this build_vector into a pair of horizontal add/sub
// if we can simply emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into two horizontal add/sub followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
isUndefLO, isUndefHI);
}
return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG);
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
IsShift = true;
break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
// Don't do this if the buildvector is a splat - we'd replace one
// constant with an entire vector.
if (Op->getSplatValue())
return SDValue();
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
break;
}
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
// Extend shift amounts.
if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
if (!IsShift)
return SDValue();
RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
}
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
// Limit to shifts by uniform immediates.
// TODO: Only accept vXi8/vXi64 special cases?
// TODO: Permit non-uniform XOP/AVX2/MULLO cases?
if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
return SDValue();
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
if (!IsShift)
return Res;
// Immediately lower the shift to ensure the constant build vector doesn't
// get converted to a constant pool before the shift is lowered.
return LowerShift(Res, Subtarget, DAG);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode()))
return Op;
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getOnesVector(VT, DAG, DL);
}
return SDValue();
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle a SrcVec that doesn't match the VT size.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
SDValue NewSrcVec =
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
if (NewSrcVec)
return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
return SDValue();
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
return SDValue();
}
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
}
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
};
unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
break;
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
break;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v4f32:
case MVT::v4i32:
if (Subtarget.hasAVX()) {
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
ISD::CondCode::SETEQ);
}
break;
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as PSHUFB uses bits[3:0] to permute elements and we don't
// care about bit[7] as it's just an index vector.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
ISD::CondCode::SETGT);
};
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
PSHUFBBuilder);
}
break;
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
}
break;
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
SDLoc(SrcVec));
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
}
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
break;
}
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
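// e.g. permuting v4i32 via PSHUFB (v16i8) gives Scale = 32 / 8 = 4, so each
// 32-bit index is expanded into four byte indices by ScaleIndices.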
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
// (extract_elt V, (extract_elt I, 1)),
// ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
SDValue Op = V.getOperand(Idx);
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract encountered in V, set the source vector,
// otherwise verify the extract is from the previously defined source
// vector.
if (!SrcVec)
SrcVec = Op.getOperand(0);
else if (SrcVec != Op.getOperand(0))
return SDValue();
SDValue ExtractedIndex = Op->getOperand(1);
// Peek through extends.
if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
ExtractedIndex = ExtractedIndex.getOperand(0);
if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract from the index vector candidate, set the
// indices vector, otherwise verify the extract is from the previously
// defined indices vector.
if (!IndicesVec)
IndicesVec = ExtractedIndex.getOperand(0);
else if (IndicesVec != ExtractedIndex.getOperand(0))
return SDValue();
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
if (!PermIdx || PermIdx->getAPIntValue() != Idx)
return SDValue();
}
SDLoc DL(V);
MVT VT = V.getSimpleValueType();
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getVectorIdxConstant(i, dl);
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (e.g., floating point). We
// must lower the vector right here because we cannot guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
}
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// check whether it's possible to issue this instead:
// shuffle (vload ptr), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
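// Illustrative example (hypothetical operands): v8i32 <a,b,a,b,a,b,a,b> is
// rebuilt as the v4i32 build_vector <a,b,undef,undef>, bitcast to v2i64,
// broadcast to v4i64, and bitcast back to v8i32.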
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
};
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If the element VT is 32 bits and there are 4 elements, try to generate an INSERTPS.
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*; start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
unsigned NumOperands = Op.getNumOperands();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
++NumZero;
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
NonZeros |= 1 << i;
++NumNonZero;
}
}
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
// Otherwise, build it up through insert_subvectors.
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
: DAG.getUNDEF(ResVT);
MVT SubVT = Op.getOperand(0).getSimpleValueType();
unsigned NumSubElems = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumOperands; ++i) {
if ((NonZeros & (1 << i)) == 0)
continue;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
Op.getOperand(i),
DAG.getIntPtrConstant(i * NumSubElems, dl));
}
return Vec;
}
// Lower a concatenation of vXi1 (mask) vectors. Zero and undef subvector
// operands are tracked so that the non-zero subvectors can be placed with as
// few k-register insert/shift operations as possible.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
Zeros |= (uint64_t)1 << i;
else
NonZeros |= (uint64_t)1 << i;
}
unsigned NumElems = ResVT.getVectorNumElements();
// If we are inserting a non-zero vector and there are zeros in the LSBs and
// undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
// insert_subvector will give us two kshifts.
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
Log2_64(NonZeros) != NumOperands - 1) {
MVT ShiftVT = ResVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
DAG.getUNDEF(ShiftVT), SubVec,
DAG.getIntPtrConstant(0, dl));
Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
DAG.getIntPtrConstant(0, dl));
}
// If there are zero or one non-zeros we can handle this very simply.
if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
if (!NonZeros)
return Vec;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
}
if (NumOperands > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
// AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
// A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
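/// For example, the masks <0, 1, 2, 3> and <0, -1, 2, -1> are no-ops, while
/// <1, 0, 2, 3> is not (element 0 would move).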
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != i)
return false;
}
return true;
}
/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
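/// For example, with 32-bit elements and 128-bit lanes (4 elements per lane),
/// the v8f32 mask <4, 1, 2, 3, 0, 5, 6, 7> is lane-crossing: element 0 reads
/// from lane 1 and element 4 reads from lane 0.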
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask) {
assert(LaneSizeInBits && ScalarSizeInBits &&
(LaneSizeInBits % ScalarSizeInBits) == 0 &&
"Illegal shuffle lane size");
int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
return true;
return false;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
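/// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats within each
/// 128-bit lane and produces RepeatedMask <0, 5, 2, 7>, with the second-vector
/// entries remapped into [4, 8).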
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
: Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
SmallVector<int, 32> RepeatedMask;
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] == SM_SentinelZero) {
if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
return false;
RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as ExpectedMask, and each
/// element of the mask is either -1 (signifying undef) or equal to the
/// corresponding value in ExpectedMask.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
if (Mask.size() != ExpectedMask.size())
return false;
int Size = Mask.size();
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
MaskBV->getOperand(Mask[i] % Size) !=
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
}
return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask,
SDValue V1 = SDValue(),
SDValue V2 = SDValue()) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask");
// Check for out-of-range target shuffle mask indices.
if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
return false;
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
for (int i = 0; i < Size; ++i) {
if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
continue;
if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (MaskBV && ExpectedBV &&
MaskBV->getOperand(Mask[i] % Size) ==
ExpectedBV->getOperand(ExpectedMask[i] % Size))
continue;
}
// TODO - handle SM_Sentinel equivalences.
return false;
}
return true;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
SDValue Cond) {
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return false;
unsigned Size = Cond.getValueType().getVectorNumElements();
Mask.resize(Size, SM_SentinelUndef);
for (int i = 0; i != (int)Size; ++i) {
SDValue CondElt = Cond.getOperand(i);
Mask[i] = i;
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
if (CondElt.isUndef() || isNullConstant(CondElt))
Mask[i] += Size;
}
return true;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return false;
SmallVector<int, 8> Unpcklwd;
createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
/* Unary = */ false);
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
isTargetShuffleEquivalent(Mask, Unpckhwd));
return IsUnpackwdMask;
}
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
// Create 128-bit vector type based on mask size.
MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
MVT VT = MVT::getVectorVT(EltVT, Mask.size());
// We can't assume a canonical shuffle mask, so try the commuted version too.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
// Match any of unary/binary or low/high.
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
isTargetShuffleEquivalent(CommutedMask, UnpackMask))
return true;
}
return false;
}
/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. If an element is undefined in only one half of the mask, the halves
/// are not considered identical.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
unsigned HalfSize = Mask.size() / 2;
for (unsigned i = 0; i != HalfSize; ++i) {
if (Mask[i] != Mask[i + HalfSize])
return false;
}
return true;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
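/// For example, the mask <3, -1, 2, 1> produces the immediate
/// 0b01100111 (0x67); the undef element defaults to its own lane (1).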
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
// The shuffle result has the form:
// 0* a[0] 0* a[1] ... 0* a[n] (n >= 0), where 0* denotes a possibly empty run
// of zero elements and the a[i] elements appear in ascending order. Each
// element of Zeroable corresponds to a particular element of Mask, as
// described in computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose non-zero elements are in
// increasing order; if such a sub-mask exists, the function returns true.
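// Illustrative example (hypothetical operands): for a 4-element shuffle with
// elements 1 and 3 zeroable and Mask = <0, 4, 1, 5>, the non-zeroable mask
// elements form the increasing sequence <0, 1>, so the function returns true
// with IsZeroSideLeft == false.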
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
for (int i = 0, e = Mask.size(); i < e; i++) {
// Undef mask elements are not supported; zeroable elements are skipped below.
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non-zero element.
if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
// Exit if the mask's non-zero elements are not in increasing order.
if (NextElement != Mask[i])
return false;
NextElement++;
}
return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
const int NumEltBytes = VT.getScalarSizeInBits() / 8;
assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
(Subtarget.hasAVX2() && VT.is256BitVector()) ||
(Subtarget.hasBWI() && VT.is512BitVector()));
SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
// Sign bit set in i8 mask means zero element.
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
SDValue V;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / NumEltBytes];
if (M < 0) {
PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
continue;
}
if (Zeroable[i / NumEltBytes]) {
PSHUFBMask[i] = ZeroMask;
continue;
}
// We can only use a single input of V1 or V2.
SDValue SrcV = (M >= Size ? V2 : V1);
if (V && V != SrcV)
return SDValue();
V = SrcV;
M %= Size;
// PSHUFB can't cross lanes, ensure this doesn't happen.
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
return SDValue();
M = M % LaneSize;
M = M * NumEltBytes + (i % NumEltBytes);
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
}
assert(V && "Failed to find a source input");
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
unsigned NumElts = VT.getVectorNumElements();
assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
"Unexpected number of vector elements");
SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
ArrayRef<int> TargetMask, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
int M1 = TargetMask[i + 0];
int M2 = TargetMask[i + 1];
Undef1 &= (SM_SentinelUndef == M1);
Undef2 &= (SM_SentinelUndef == M2);
Zero1 &= isUndefOrZero(M1);
Zero2 &= isUndefOrZero(M2);
}
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected");
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
// If an unary shuffle, attempt to match as an unpack lo/hi with zero.
if (IsUnary && (Zero1 || Zero2)) {
// Don't bother if we can blend instead.
if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
return false;
bool MatchLo = true, MatchHi = true;
for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
int M = TargetMask[i];
// Ignore if the input is known to be zero or the index is undef.
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
(M == SM_SentinelUndef))
continue;
MatchLo &= (M == Unpckl[i]);
MatchHi &= (M == Unpckh[i]);
}
if (MatchLo || MatchHi) {
UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
return true;
}
}
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
}
}
return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
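// For example, for v4f32 the binary unpack masks are <0, 4, 1, 5> (UNPCKL) and
// <2, 6, 3, 7> (UNPCKH); the commuted forms swap the two inputs.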
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1, SDValue V2,
SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
}
/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SmallVector<int, 32> Unpckl, Unpckh;
createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
unsigned UnpackOpcode;
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
UnpackOpcode = X86ISD::UNPCKL;
else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
UnpackOpcode = X86ISD::UNPCKH;
else
return SDValue();
// This is a "natural" unpack operation (rather than the 128-bit sectored
// operation implemented by AVX). We need to rearrange 64-bit chunks of the
// input in order to use the x86 instruction.
V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
V1 = DAG.getBitcast(VT, V1);
return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}
// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
// source into the lower elements and zeroing the upper elements.
// TODO: Merge with matchShuffleAsVPMOV.
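// Illustrative example (hypothetical shuffle): a v16i8 shuffle whose mask
// starts <0, 2, 4, 6, 8, 10, 12, 14> and whose upper 8 elements are zeroable
// can be treated as truncating a v8i16 source; since the truncated data is
// only 64 bits wide it is matched as X86ISD::VTRUNC with a v16i8 destination
// (the i16 -> i8 case additionally requires AVX512BW).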
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
ArrayRef<int> Mask, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
if (!VT.is512BitVector() && !Subtarget.hasVLX())
return false;
unsigned NumElts = Mask.size();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
unsigned MaxScale = 64 / EltSizeInBits;
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
unsigned SrcEltBits = EltSizeInBits * Scale;
if (SrcEltBits < 32 && !Subtarget.hasBWI())
continue;
unsigned NumSrcElts = NumElts / Scale;
if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
continue;
unsigned UpperElts = NumElts - NumSrcElts;
if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
continue;
SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
DstVT = MVT::getIntegerVT(EltSizeInBits);
if ((NumSrcElts * EltSizeInBits) >= 128) {
// ISD::TRUNCATE
DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
} else {
// X86ISD::VTRUNC
DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
}
return true;
}
return false;
}
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
// Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
return false;
// The rest of the mask should not refer to the truncated vector's elements.
if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
TruncatedVectorStart + Size))
return false;
return true;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
if (Mask.size() != VT.getVectorNumElements())
return SDValue();
bool SwappedOps = false;
if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
if (!ISD::isBuildVectorAllZeros(V1.getNode()))
return SDValue();
std::swap(V1, V2);
SwappedOps = true;
}
// Look for:
//
// bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
// bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
//
// and similar ones.
if (V1.getOpcode() != ISD::BITCAST)
return SDValue();
if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue Src = V1.getOperand(0).getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// The vptrunc** instructions truncating 128 bit and 256 bit vectors
// are only available with avx512vl.
if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
return SDValue();
// Down-converting word to byte is only available with avx512bw. The case with
// a 256-bit output doesn't contain a shuffle and is therefore not handled here.
if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
!Subtarget.hasBWI())
return SDValue();
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
!matchShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
// partially undef inputs.
bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
// Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j]) {
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
}
// Early exit if we exhaust the possible powers of two.
if (!IsAnyViable)
break;
}
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j])
return j + 1;
// Return 0 as there is no viable power of two.
return 0;
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
// Checks for compaction shuffle masks if MaxStages > 1.
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
"Illegal maximum compaction");
auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
unsigned NumSrcBits = PackVT.getScalarSizeInBits();
unsigned NumPackedBits = NumSrcBits - BitSize;
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
if (Subtarget.hasSSE41() || BitSize == 8) {
APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
}
return false;
};
// Attempt to match against wider and wider compaction patterns.
for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false, NumStages);
if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2, PackVT))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true, NumStages);
if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1, PackVT))
return true;
}
return false;
}
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
SDValue V1, SDValue V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
unsigned SizeBits = VT.getSizeInBits();
unsigned EltBits = VT.getScalarSizeInBits();
unsigned MaxStages = Log2_32(64 / EltBits);
if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
Subtarget, MaxStages))
return SDValue();
unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
// Don't lower multi-stage packs on AVX512; truncation is better.
if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
return SDValue();
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
unsigned MaxPackBits = 16;
if (CurrentEltBits > 16 &&
(PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
MaxPackBits = 32;
// Repeatedly pack down to the target size.
SDValue Res;
for (unsigned i = 0; i != NumStages; ++i) {
unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
unsigned NumSrcElts = SizeBits / SrcEltBits;
MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
DAG.getBitcast(SrcVT, V2));
V1 = V2 = Res;
CurrentEltBits /= 2;
}
assert(Res && Res.getValueType() == VT &&
"Failed to lower compaction shuffle");
return Res;
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
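/// Illustrative example (hypothetical operands): for a v4i32 shuffle with
/// Mask = <0, 5, 2, 7> where elements 1 and 3 are zeroable, the result is
/// simply V1 & <-1, 0, -1, 0>.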
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
SDValue Zero, AllOnes;
// Use f64 if i64 isn't legal.
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
MaskVT = MVT::getVectorVT(EltVT, Mask.size());
}
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
APFloat AllOnesValue = APFloat::getAllOnesValue(
SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
Zero = DAG.getConstant(0, DL, EltVT);
AllOnes = DAG.getAllOnesConstant(DL, EltVT);
}
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])
continue;
if (Mask[i] % Size != i)
return SDValue(); // Not a blend.
if (!V)
V = Mask[i] < Size ? V1 : V2;
else if (V != (Mask[i] < Size ? V1 : V2))
return SDValue(); // Can only let one input through the mask.
VMaskOps[i] = AllOnes;
}
if (!V)
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
VMask = DAG.getBitcast(LogicVT, VMask);
V = DAG.getBitcast(LogicVT, V);
SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
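/// Illustrative example (hypothetical operands): for the v4i32 blend mask
/// <0, 5, 2, 7>, V1Mask is <-1, 0, -1, 0> and the result is
/// (V1 & V1Mask) | (V2 & ~V1Mask).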
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
MutableArrayRef<int> Mask,
const APInt &Zeroable, bool &ForceV1Zero,
bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
BlendMask |= 1ull << i;
continue;
}
if (Zeroable[i]) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
Mask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
Mask[i] = i + Size;
continue;
}
}
return false;
}
return true;
}
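// Scale each bit of a blend mask so that it covers Scale consecutive bits of
// the widened mask. For example, BlendMask = 0b0101 with Size = 4 and
// Scale = 2 becomes 0b00110011.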
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
int Scale) {
uint64_t ScaledMask = 0;
for (int i = 0; i != Size; ++i)
if (BlendMask & (1ull << i))
ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
return ScaledMask;
}
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 64> Mask(Original.begin(), Original.end());
if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
LLVM_FALLTHROUGH;
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(HiMask, DL, MVT::i8));
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
}
LLVM_FALLTHROUGH;
}
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// If we have VPTERNLOG, we can use that as a bit blend.
if (Subtarget.hasVLX())
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return BitBlend;
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
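///
/// For example (an illustrative sketch, not an exhaustive description): the
/// v4i32 mask <6, 0, 1, 7> can be lowered by first blending to
/// <V1[0], V1[1], V2[2], V2[3]> (blend mask <0, 1, 6, 7>) and then applying
/// the single-input permute <2, 0, 1, 3> to that blended result.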
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG,
bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
SmallVector<int, 32> PermuteMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
PermuteMask[i] = Mask[i] % Size;
}
// If only immediate blends, then bail if the blend mask can't be widened to
// i16.
unsigned EltSize = VT.getScalarSizeInBits();
if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
return SDValue();
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
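///
/// For example (illustrative only): the v8i16 mask <1, 9, 0, 8, 3, 11, 2, 10>
/// is an UNPCKLWD of V1 and V2 (mask <0, 8, 1, 9, 2, 10, 3, 11>) followed by
/// the single-input permute <2, 3, 0, 1, 6, 7, 4, 5> of the unpacked result.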
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
// Determine UNPCKL/UNPCKH type and operand order.
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
SDValue &Op = Ops[Elt & 1];
if (M < NumElts && (Op.isUndef() || Op == V1))
Op = V1;
else if (NumElts <= M && (Op.isUndef() || Op == V2))
Op = V2;
else
return SDValue();
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
if (!MatchLo && !MatchHi)
return SDValue();
}
}
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
// Now check that each pair of elts come from the same unpack pair
// and set the permute mask based on each pair.
// TODO - Investigate cases where we permute individual elements.
SmallVector<int, 32> PermuteMask(NumElts, -1);
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
int M0 = Mask[Lane + Elt + 0];
int M1 = Mask[Lane + Elt + 1];
if (0 <= M0 && 0 <= M1 &&
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
return SDValue();
if (0 <= M0)
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
if (0 <= M1)
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
}
}
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
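///
/// As a rough illustration: if every referenced V1 byte lies in the top
/// quarter of a 128-bit lane and every referenced V2 byte lies in the bottom
/// quarter, one PALIGNR can pull both ranges into a single register and a
/// per-lane permute of that rotated value then produces the final order.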
static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget.hasAVX2()) ||
(VT.is512BitVector() && !Subtarget.hasBWI()))
return SDValue();
// We don't currently support lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
int Scale = VT.getScalarSizeInBits() / 8;
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = VT.getVectorNumElements();
int NumEltsPerLane = NumElts / NumLanes;
// Determine range of mask elts.
bool Blend1 = true;
bool Blend2 = true;
std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts) {
Blend1 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range1.first = std::min(Range1.first, M);
Range1.second = std::max(Range1.second, M);
} else {
M -= NumElts;
Blend2 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range2.first = std::min(Range2.first, M);
Range2.second = std::max(Range2.second, M);
}
}
}
// Bail if we don't need both elements.
// TODO - it might be worth doing this for unary shuffles if the permute
// can be widened.
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
!(0 <= Range2.first && Range2.second < NumEltsPerLane))
return SDValue();
if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
return SDValue();
// Rotate the 2 ops so we can access both ranges, then permute the result.
auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts)
PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
else
PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
}
}
return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
};
// Check if the ranges are small enough to rotate from either direction.
if (Range2.second < Range1.first)
return RotateAndPermute(V1, V2, Range1.first, 0);
if (Range1.second < Range2.first)
return RotateAndPermute(V2, V1, Range2.first, NumElts);
return SDValue();
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
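///
/// For example (a minimal sketch): the v4i32 mask <5, 0, 7, 2> decomposes
/// into the single-input shuffles <-1, 0, -1, 2> on V1 and <1, -1, 3, -1> on
/// V2, followed by the blend <4, 1, 6, 3> of the two pre-shuffled values.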
static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
// blend them together.
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
SmallVector<int, 32> BlendMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] < Size) {
V1Mask[i] = Mask[i];
BlendMask[i] = i;
} else if (Mask[i] >= Size) {
V2Mask[i] = Mask[i] - Size;
BlendMask[i] = i + Size;
}
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or other benefit. However, when
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG, true))
return BlendPerm;
if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
DAG))
return UnpackPerm;
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG))
return BlendPerm;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// Try to lower a vector shuffle as a bit rotation.
///
/// Look for a repeated rotation pattern in each sub group.
/// Returns an ISD::ROTL element rotation amount or -1 if no match is found.
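///
/// For example (illustrative): the v16i8 mask
/// <1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12> matches with
/// NumSubElts == 4 and a rotation amount of 3, which the caller scales to a
/// 24-bit VROTLI of each i32 element.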
static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
int NumElts = Mask.size();
assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
int RotateAmt = -1;
for (int i = 0; i != NumElts; i += NumSubElts) {
for (int j = 0; j != NumSubElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
if (!isInRange(M, i, i + NumSubElts))
return -1;
int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
if (0 <= RotateAmt && Offset != RotateAmt)
return -1;
RotateAmt = Offset;
}
}
return RotateAmt;
}
static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
const X86Subtarget &Subtarget,
ArrayRef<int> Mask) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
// AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
int MaxSubElts = 64 / EltSizeInBits;
for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
if (RotateAmt < 0)
continue;
int NumElts = Mask.size();
MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
return RotateAmt * EltSizeInBits;
}
return -1;
}
/// Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Only XOP + AVX512 targets have bit rotation instructions.
// If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
bool IsLegal =
(VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
if (!IsLegal && Subtarget.hasSSE3())
return SDValue();
MVT RotateVT;
int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
Subtarget, Mask);
if (RotateAmt < 0)
return SDValue();
// For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
// expanded to OR(SRL,SHL), will be more efficient, but if they can
// widen to vXi16 or more then the existing lowering will be better.
if (!IsLegal) {
if ((RotateAmt % 16) == 0)
return SDValue();
// TODO: Use getTargetVShiftByConstNode.
unsigned ShlAmt = RotateAmt;
unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
V1 = DAG.getBitcast(RotateVT, V1);
SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
return DAG.getBitcast(VT, Rot);
}
SDValue Rot =
DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
return DAG.getBitcast(VT, Rot);
}
/// Try to match a vector shuffle as an element rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, -1, -1, -1, -1, -1, 1, 2]
// [ 3, 4, 5, 6, 7, 8, 9, 10]
// [-1, 4, 5, 6, -1, -1, 9, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
"Unexpected mask index.");
if (M < 0)
continue;
// Determine where a rotated vector would have started.
int StartIdx = i - (M % NumElts);
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
return -1;
// If we found the tail of a vector, the rotation must be the missing
// front. If we found the head of a vector, the rotation must be the
// length of that head.
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
if (Rotation == 0)
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
return -1;
// Compute which value this mask is pointing at.
SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
// elements are remaining.
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
// Either set up this value if we've not encountered it before, or check
// that it remains consistent.
if (!TargetV)
TargetV = MaskV;
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
return -1;
}
// Check that we successfully analyzed the mask, and normalize the results.
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
if (!Lo)
Lo = Hi;
else if (!Hi)
Hi = Lo;
V1 = Lo;
V2 = Hi;
return Rotation;
}
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (isAnyZero(Mask))
return -1;
// PALIGNR works on 128-bit lanes.
SmallVector<int, 16> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
// PALIGNR rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int NumElts = RepeatedMask.size();
int Scale = 16 / NumElts;
return Rotation * Scale;
}
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
// Cast the inputs to i8 vector of correct length to match PALIGNR or
// PSLLDQ/PSRLDQ.
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 &&
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
SDValue LoShift =
DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift =
DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
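///
/// For example (illustrative): the v8i32 mask <3, 4, 5, 6, 7, 8, 9, 10> takes
/// elements 3..7 of V1 followed by elements 0..2 of V2 and maps to a single
/// VALIGND with a rotation immediate of 3.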
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
DAG.getTargetConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a byte shift sequence.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
// We need a shuffle that has zeros at one/both ends and a sequential
// shuffle from one source within.
unsigned ZeroLo = Zeroable.countTrailingOnes();
unsigned ZeroHi = Zeroable.countLeadingOnes();
if (!ZeroLo && !ZeroHi)
return SDValue();
unsigned NumElts = Mask.size();
unsigned Len = NumElts - (ZeroLo + ZeroHi);
if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
return SDValue();
unsigned Scale = VT.getScalarSizeInBits() / 8;
ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
if (!isUndefOrInRange(StubMask, 0, NumElts) &&
!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
return SDValue();
SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
Res = DAG.getBitcast(MVT::v16i8, Res);
// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
// inner sequential set of elements, possibly offset:
// 01234567 --> zzzzzz01 --> 1zzzzzzz
// 01234567 --> 4567zzzz --> zzzzz456
// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
if (ZeroLo == 0) {
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
return DAG.getBitcast(VT, Res);
}
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSLL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
}
int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
}
// no match
return -1;
}
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
MVT ShiftVT;
SDValue V = V1;
unsigned Opcode;
// Try to match shuffle against V1 shift.
int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, Size, Zeroable, Subtarget);
V = V2;
}
if (ShiftAmt < 0)
return SDValue();
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
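// For example (illustrative, with 'zz' denoting a zeroable element): the
// v8i16 mask <1, 2, 3, zz, undef, undef, undef, undef> extracts Len = 3
// elements starting at Idx = 1, giving BitLen = 48 and BitIdx = 16.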
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
// Attempt to match first Len sequential elements from the lower half.
SDValue Src;
int Idx = -1;
for (int i = 0; i != Len; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
// The extracted elements must start at a valid index and all mask
// elements must be in the lower half.
if (i > M || M >= HalfSize)
return false;
if (Idx < 0 || (Src == V && Idx == (M - i))) {
Src = V;
Idx = M - i;
continue;
}
return false;
}
if (!Src || Idx < 0)
return false;
assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Src;
return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue;
}
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
}
}
return false;
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling, the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must come from the same lane.
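///
/// For instance (a minimal sketch), Scale == 4 with 8-bit elements and
/// Offset == 0 produces a v4i32 any/zero extension of the four lowest bytes
/// of the input.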
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
};
// Shift along an input so that the offset base moves to the first element.
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
}
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
// Found a valid a/zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
-1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
if ((i % Scale == 0 && SafeOffset(Idx))) {
PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
continue;
}
PSHUFBMask[i] =
AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
}
// Otherwise emit a sequence of unpacks.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
}
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
}
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
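///
/// For example (illustrative, with 'zz' denoting a zeroable element): the
/// v16i8 mask <0, zz, 1, zz, 2, zz, 3, zz, 4, zz, 5, zz, 6, zz, 7, zz> is a
/// zero extension of the low eight bytes of V1 to v8i16; it becomes PMOVZXBW
/// with SSE4.1 and a PUNPCKLBW against a zero vector otherwise.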
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
continue;
}
// Each of the base elements needs to be consecutive indices into the
// same input vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
if (!InputV) {
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input, we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
};
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
}
// No viable ext lowering found.
return SDValue();
}
/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
// Ensure the scalar operand is the same size as the destination.
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
return DAG.getBitcast(EltVT, S);
}
return SDValue();
}
/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowerings
/// across all subtarget feature sets.
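///
/// For example (a minimal sketch, with 'zz' denoting a zeroable element): a
/// v4i32 mask <4, zz, zz, zz>, where V2's low element comes from a scalar,
/// lowers to a VZEXT_MOVL-style insertion of that scalar into a zero vector;
/// if V1 is not zeroable, a floating-point insertion into lane 0 falls back
/// to MOVSS/MOVSD instead.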
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
}
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getTargetConstant(
V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
}
/// Try to lower a broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
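///
/// For example (illustrative): broadcasting element 0 of a v4i32
/// build_vector as a v16i8 splat truncates the i32 scalar to i8 before the
/// broadcast (VPBROADCASTB); broadcasting byte 2 of that element additionally
/// shifts the scalar right by 16 bits first.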
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
int BroadcastIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
MVT EltVT = VT.getVectorElementType();
MVT V0VT = V0.getSimpleValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
MVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
const unsigned EltSize = EltVT.getSizeInBits();
const unsigned V0EltSize = V0EltVT.getSizeInBits();
// This is only a truncation if the original element type is larger.
if (V0EltSize <= EltSize)
return SDValue();
assert(((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!");
const unsigned V0Opc = V0.getOpcode();
const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
V0Opc != ISD::BUILD_VECTOR)
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
// If we're extracting non-least-significant bits, shift so we can truncate.
// Hopefully, we can fold away the trunc/srl/load into the broadcast.
// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
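///
/// For example (illustrative): <0, 1, 4, 5> is a single SHUFPS (low half from
/// V1, high half from V2), while <0, 4, 1, 5> is not, because its low half
/// needs elements from both inputs.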
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
}
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
MVT VT = N0.getSimpleValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
// Check that both sources are extracts of the same source vector.
if (!N0.hasOneUse() || !N1.hasOneUse() ||
N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N0.getOperand(0) != N1.getOperand(0))
return SDValue();
SDValue WideVec = N0.getOperand(0);
MVT WideVT = WideVec.getSimpleValueType();
if (!WideVT.is256BitVector())
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
// if the extract of the low half is N1.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
ShuffleVectorSDNode::commuteMask(NewMask);
else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
// because that avoids a constant load from memory.
if (NumElts == 4 &&
(isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
NewMask.append(NumElts, -1);
// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
NewMask);
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getIntPtrConstant(0, DL));
}
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
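///
/// For example (a minimal sketch): an AVX2 v8i32 splat of element 0 becomes a
/// VBROADCAST_LOAD of the 32-bit scalar when the source is a simple vector
/// load, and a VPBROADCASTD from the register otherwise.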
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
// TODO: Combine this logic with findEltLoadSrc() used by
// EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
int OpBitWidth = V.getOperand(0).getValueSizeInBits();
int OpIdx = BitOffset / OpBitWidth;
V = V.getOperand(OpIdx);
BitOffset %= OpBitWidth;
continue;
}
case ISD::EXTRACT_SUBVECTOR: {
// The extraction index adds to the existing offset.
unsigned EltBitWidth = V.getScalarValueSizeInBits();
unsigned Idx = V.getConstantOperandVal(1);
unsigned BeginOffset = Idx * EltBitWidth;
BitOffset += BeginOffset;
V = V.getOperand(0);
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
int EltBitWidth = VOuter.getScalarValueSizeInBits();
int Idx = (int)V.getConstantOperandVal(2);
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
}
continue;
}
}
break;
}
assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
BroadcastIdx = BitOffset / NumEltBits;
// Do we need to bitcast the source to retrieve the original broadcast index?
bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// If the original value has a larger element type than the shuffle, the
// broadcast element is in essence truncated. Make that explicit to ease
// folding.
if (BitCastSrc && VT.isInteger())
if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (ISD::isNormalLoad(V.getNode()) &&
cast<LoadSDNode>(V)->isSimple()) {
// We do not check for one-use of the vector load because a broadcast load
// is expected to be a win for code size, register pressure, and possibly
// uops even if the original vector load is not eliminated.
// Reduce the vector load and shuffle to a broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
// than MOVDDUP.
// FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
if (Opcode == X86ISD::VBROADCAST) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {Ld->getChain(), NewAddr};
V = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
return DAG.getBitcast(VT, V);
}
assert(SVT == MVT::f64 && "Unexpected VT!");
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
if ((BitOffset % 128) != 0)
return SDValue();
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
"Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// If this is a scalar, do the broadcast on this type and bitcast.
if (!V.getValueType().isVector()) {
assert(V.getScalarValueSizeInBits() == NumEltBits &&
"Unexpected scalar size");
MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
if (V.getValueSizeInBits() > 128)
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
// Otherwise cast V to a vector with the same element type as VT, but
// possibly narrower than VT. Then perform the broadcast.
unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
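// For example (illustrative): the v4f32 mask <0, 1, 6, 3> keeps V1 elements
// 0, 1 and 3 in place and inserts V2 element 2 into lane 2, producing an
// INSERTPS immediate of 0xA0 (src 2, dst 2, no zero mask).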
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
const APInt &Zeroable,
ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Attempt to match INSERTPS with one element from VA or VB being
// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
// are updated.
auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
ArrayRef<int> CandidateMask) {
unsigned ZMask = 0;
int VADstIndex = -1;
int VBDstIndex = -1;
bool VAUsedInPlace = false;
for (int i = 0; i < 4; ++i) {
// Synthesize a zero mask from the zeroable elements (includes undefs).
if (Zeroable[i]) {
ZMask |= 1 << i;
continue;
}
// Flag if we use any VA inputs in place.
if (i == CandidateMask[i]) {
VAUsedInPlace = true;
continue;
}
// We can only insert a single non-zeroable element.
if (VADstIndex >= 0 || VBDstIndex >= 0)
return false;
if (CandidateMask[i] < 4) {
// VA input out of place for insertion.
VADstIndex = i;
} else {
// VB input for insertion.
VBDstIndex = i;
}
}
// Don't bother if we have no (non-zeroable) element for insertion.
if (VADstIndex < 0 && VBDstIndex < 0)
return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned VBSrcIndex = 0;
if (VADstIndex >= 0) {
// If we have a VA input out of place, we use VA as the V2 element
// insertion and don't use the original V2 at all.
VBSrcIndex = CandidateMask[VADstIndex];
VBDstIndex = VADstIndex;
VB = VA;
} else {
VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
// If no VA inputs are used in place, then the result is created only from
// the zero mask and the VB insertion - so remove the VA dependency.
if (!VAUsedInPlace)
VA = DAG.getUNDEF(MVT::v4f32);
// Update V1, V2 and InsertPSMask accordingly.
V1 = VA;
V2 = VB;
// Insert the V2 element into the desired position.
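// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination lane in bits [5:4] and the zero mask in bits [3:0]. E.g. for
// Mask = {0, 5, 2, 3} we get VBSrcIndex = 1, VBDstIndex = 1 and ZMask = 0,
// so InsertPSMask = 0x50.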
InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
return true;
};
if (matchAsInsertPS(V1, V2, Mask))
return true;
// Commute and try again.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
if (matchAsInsertPS(V2, V1, CommutedMask))
return true;
return false;
}
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> Mask, const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(VT.is128BitVector() &&
"This routine only works on 128-bit vectors.");
assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
auto TryUnpack = [&](int ScalarSize, int Scale) {
SmallVector<int, 16> V1Mask((unsigned)Size, -1);
SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// Each element of the unpack contains Scale elements from this mask.
int UnpackIdx = i / Scale;
// We only handle the case where V1 feeds the first slots of the unpack.
// We rely on canonicalization to ensure this is the case.
if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
return SDValue();
// Set up the mask for this input. The indexing is tricky as we have to
// handle the unpack stride.
SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
Mask[i] % Size;
}
// If we will have to shuffle both inputs to use the unpack, check whether
// we can just unpack first and shuffle the result. If so, skip this unpack.
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
!isNoopShuffleMask(V2Mask))
return SDValue();
// Shuffle the inputs into place.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
// Unpack the inputs and cast the result back to the desired type.
return DAG.getBitcast(
VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
UnpackVT, V1, V2));
};
// We try each unpack from the largest to the smallest to try and find one
// that fits this mask.
int OrigScalarSize = VT.getScalarSizeInBits();
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
// If we're shuffling with a zero vector then we're better off not doing
// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
ISD::isBuildVectorAllZeros(V2.getNode()))
return SDValue();
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
if (NumLoInputs == 0 || NumHiInputs == 0) {
assert((NumLoInputs > 0 || NumHiInputs > 0) &&
"We have to have *some* inputs!");
int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
// FIXME: We could consider the total complexity of the permute of each
// possible unpacking. Or at the least we should consider how many
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
PermMask[i] =
2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
}
return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);
}
return SDValue();
}
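// Worked example for the permute+unpack lowering above: a v4i32 mask of
// {0, 5, 1, 4} is not an unpack by itself, but after swapping V2's first two
// elements (V2Mask = {1, 0, -1, -1}) the result is exactly
// UNPCKLDQ(V1, V2') = {V1[0], V2[1], V1[1], V2[0]}.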
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
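// E.g. Mask = {1, 1} yields SHUFPDMask = 0b11, i.e. duplicate the high
// element, and Mask = {1, 0} yields 0b01, i.e. swap the two elements.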
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPD, which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
X86ISD::MOVSD, DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
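// E.g. Mask = {1, 3} selects the high element of each input, giving
// SHUFPDMask = 0b11 and a result of {V1[1], V2[1]}.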
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
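// E.g. a v2i64 mask of {1, 0} widens to the v4i32 mask {2, 3, 0, 1}, which
// getV4X86ShuffleImm8ForMask encodes as 0x4E (two bits per result element,
// element 0 in the low bits).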
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
}
assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD, which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives still cost more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
V1 = DAG.getBitcast(MVT::v2f64, V1);
V2 = DAG.getBitcast(MVT::v2f64, V2);
return DAG.getBitcast(MVT::v2i64,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
if (V2Index < 2)
std::swap(LowV, HighV);
NewMask[V2Index] -= 4;
} else {
// Handle the case where the V2 element ends up adjacent to a V1 element.
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
if (V2Index < 2) {
LowV = V2;
HighV = V1;
} else {
HighV = V2;
}
NewMask[V1Index] = 2; // We put the V1 element in V2[2].
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
} else if (Mask[2] < 4 && Mask[3] < 4) {
// We also handle the reversed case because this utility may get called
// when we detect a SHUFPS pattern but can't easily commute the shuffle to
// arrange things in the right direction.
NewMask[0] -= 4;
NewMask[1] -= 4;
HighV = V1;
LowV = V2;
} else {
// We have a mixture of V1 and V2 in both low and high lanes. Rather than
// trying to place elements directly, just blend them and set up the final
// shuffle to place them.
// The first two blend mask elements are for V1, the second two are for
// V2.
int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
LowV = HighV = V1;
NewMask[0] = Mask[0] < 4 ? 0 : 2;
NewMask[1] = Mask[0] < 4 ? 2 : 0;
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
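// E.g. Mask = {0, 4, 2, 6}: BlendMask = {0, 2, 0, 2} produces
// {V1[0], V1[2], V2[0], V2[2]}, and NewMask = {0, 2, 1, 3} then places
// those elements to realize the original mask.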
}
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
// in SSE1 because otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
V2, Mask, DAG))
return BlendPerm;
}
// Use low/high mov instructions. These are only valid in SSE1 because
// otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Try to use broadcast unless the mask only has one non-undef element.
if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
}
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions,
// but we aren't actually going to use the UNPCK instruction because doing
// so would either prevent folding a load into this instruction or force an
// extra register copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (!isSingleSHUFPSMask(Mask)) {
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
}
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
}
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
}
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// If we are shuffling values from one half - check how many different DWORD
// pairs we need to create. If only 1 or 2 then we can perform this as a
// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
};
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = { -1, -1, -1, -1 };
SmallVector<std::pair<int, int>, 4> DWordPairs;
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
// Collect the different DWORD pairs.
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
M0 = (M0 >= 0 ? M0 % 4 : M0);
M1 = (M1 >= 0 ? M1 % 4 : M1);
if (M0 < 0 && M1 < 0)
continue;
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
Match = true;
break;
}
}
if (!Match) {
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
DWordPairs.push_back(std::make_pair(M0, M1));
}
}
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
}
}
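// E.g. Mask = {2, 3, 0, 1, 2, 3, 0, 1} (all inputs from the low half)
// collects the dword pairs (2,3) and (0,1), so a single
// PSHUFLW {2, 3, 0, 1} followed by PSHUFD {0, 1, 0, 1} suffices.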
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
// to the generic code below. For example:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
//
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the half
// other than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine any resulting sequence of PSHUFD instructions
// into a single instruction. Here is an example of the tricky case:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
//
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
//
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
//
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
//
// The result is fine to be handled by the generic logic.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of the dword with only one word among the three inputs
// in a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We need
// to balance this to ensure we don't form a 3-1 shuffle in the other half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
}
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
};
if (IncomingInputs.empty())
return;
if (ExistingInputs.empty()) {
// Map any dwords with inputs from them into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
}
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
}
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
}
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
IncomingInputs[0] = InputFixed;
}
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
}
// Now hoist the DWord down to the right half.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
"Lane crossing shuffle masks not supported");
int NumBytes = VT.getSizeInBits() / 8;
int Size = Mask.size();
int Scale = NumBytes / Size;
SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
V1InUse = false;
V2InUse = false;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Scale];
if (M < 0)
continue;
const int ZeroMask = 0x80;
int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
if (Zeroable[i / Scale])
V1Idx = V2Idx = ZeroMask;
V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
V1InUse |= (ZeroMask != V1Idx);
V2InUse |= (ZeroMask != V2Idx);
}
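// E.g. for a v8i16 shuffle (Scale == 2) a mask element M == 9 expands to
// byte indices 2 and 3 in V2Mask and 0x80 (zero) in V1Mask for the
// corresponding result bytes.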
MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
DAG.getBuildVector(ShufVT, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
DAG.getBuildVector(ShufVT, DL, V2Mask));
// If we need shuffled inputs from both, blend the two.
SDValue V;
if (V1InUse && V2InUse)
V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
else
V = V1InUse ? V1 : V2;
// Cast the result back to the correct type.
return DAG.getBitcast(VT, V);
}
/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Try to use bit rotation instructions.
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
// Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
// We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
// be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
!Subtarget.hasVLX()) {
SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
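// E.g. with NumEvenDrops == 1 every i32 lane is masked to its low 16 bits,
// so the PACKUS below yields {V1[0], V1[2], V1[4], V1[6], V2[0], V2[2],
// V2[4], V2[6]} in terms of the original i16 elements, matching a mask of
// {0, 2, 4, 6, 8, 10, 12, 14}.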
V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
DWordClearMask);
V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
DWordClearMask);
// Now pack things back together.
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
if (NumEvenDrops == 2) {
Result = DAG.getBitcast(MVT::v4i32, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
}
return Result;
}
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to, so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG);
}
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity-reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use a zext lowering.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Try to use bit rotation instructions.
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
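// For example, <4,4, u,7, 2,2, ...> can be widened (each adjacent byte pair
// is either equal or has at most one defined element), while <4,5, ...>
// cannot.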
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
// We can't place the inputs into a single half with a simple i16
// shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
}
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
V1 = DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
bool EvenInUse = false, OddInUse = false;
for (int i = 0; i < 16; i += 2) {
EvenInUse |= (Mask[i + 0] >= 0);
OddInUse |= (Mask[i + 1] >= 0);
if (EvenInUse && OddInUse)
break;
}
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
// Check for compaction patterns.
bool IsSingleInput = V2.isUndef();
int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
// If the mask is a binary compaction, we can more efficiently perform this
// as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
//
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be (very slightly) more efficient, we
// prefer this lowering because there are common cases where part of
// the complexity of the shuffles goes away when we do the final blend as
// an unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
if (NumEvenDrops) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
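// For example, with NumEvenDrops == 2 the word mask built here is
// <0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0>, so only bytes 0, 4, 8 and 12 of each
// v16i8 input survive the AND below.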
SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
WordClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
WordClearMask);
// Now pack things back together.
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
}
}
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
// Use splitVector/extractSubVector so that split build-vectors just build two
// narrower build vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
SDValue LoV, HiV;
std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
SDValue LoV1, HiV1, LoV2, HiV2;
std::tie(LoV1, HiV1) = SplitVector(V1);
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
if (!UseLoV2 && !UseHiV2)
return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
if (!UseLoV1 && !UseHiV1)
return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= SplitNumElements)
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
// prefer that lowering. This is especially important because broadcasts can
// often fold with memory operands.
auto DoBothBroadcast = [&] {
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
}
return true;
};
if (DoBothBroadcast())
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
// unusually few instructions.
int LaneCount = VT.getSizeInBits() / 128;
int LaneSize = Size / LaneCount;
SmallBitVector LaneInputs[2];
LaneInputs[0].resize(LaneCount, false);
LaneInputs[1].resize(LaneCount, false);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
DAG);
}
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
int LHSMask[4] = {-1, -1, -1, -1};
int RHSMask[4] = {-1, -1, -1, -1};
unsigned SHUFPMask = 0;
// As SHUFPD uses a single LHS/RHS element per lane, we can always
// perform the shuffle once the lanes have been shuffled in place.
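// For example, mask <2,4,1,7> gives LHSMask <2,u,u,1>, RHSMask <4,u,u,7> and
// SHUFPMask 0b1100, so the final SHUFPD selects <2,4,1,7> from the two
// lane-shuffled operands.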
for (int i = 0; i != 4; ++i) {
int M = Mask[i];
if (M < 0)
continue;
int LaneBase = i & ~1;
auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
LaneMask[LaneBase + (M & 1)] = M;
SHUFPMask |= (M & 1) << i;
}
SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
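/// For example, a v4f64 mask <3,2,1,0> becomes the lane permute <2,3,0,1>
/// followed by the in-lane permute <1,0,3,2>.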
static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Ensure that each lane comes from a single source lane.
int SrcLane = M / NumEltsPerLane;
int DstLane = i / NumEltsPerLane;
if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
// Make sure we set all elements of the lane mask, to avoid undef propagation.
SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
int SrcLane = SrcLaneMask[DstLane];
if (0 <= SrcLane)
for (int j = 0; j != NumEltsPerLane; ++j) {
LaneMask[(DstLane * NumEltsPerLane) + j] =
(SrcLane * NumEltsPerLane) + j;
}
}
// If we're only shuffling a single lowest lane and the rest are identity
// then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
int NumIdentityLanes = 0;
bool OnlyShuffleLowestLane = true;
for (int i = 0; i != NumLanes; ++i) {
if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
i * NumEltsPerLane))
NumIdentityLanes++;
else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
OnlyShuffleLowestLane = false;
}
if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
return SDValue();
SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
/// This lowering strategy results in four instructions in the worst case for a
/// single-input cross lane shuffle which is lower than any other fully general
/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
/// shuffle pattern should be handled prior to trying this lowering.
static SDValue lowerShuffleAsLanePermuteAndShuffle(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
int LaneSize = Size / 2;
// Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// Only do this if the elements aren't all from the lower lane,
// otherwise we're (probably) better off doing a split.
if (VT == MVT::v4f64 &&
!all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
if (SDValue V =
lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
return V;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] % Size) / LaneSize] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
// TODO - we could support shuffling V2 in the Flipped input.
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
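// Remap every element that crosses a lane so that it is taken from the
// lane-flipped copy of V1 (operand index >= Size) created below.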
for (int i = 0; i < Size; ++i) {
int &M = InLaneMask[i];
if (M < 0)
continue;
if (((M % Size) / LaneSize) != (i / LaneSize))
M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
}
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");
// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
Flipped =
DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
Flipped = DAG.getBitcast(VT, Flipped);
return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
bool IsHighZero = (Zeroable & 0xc) == 0xc;
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && IsHighZero) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
}
}
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
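// For example, WidenedMask <1, 2> (upper half of V1 into the low half, lower
// half of V2 into the high half) produces PermMask 0x21.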
assert((WidenedMask[0] >= 0 || IsLowZero) &&
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = 128 / VT.getScalarSizeInBits();
SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = {-1, -1};
SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out
// of sources we can't do anything.
int LaneSrc = M / NumLaneElts;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
else
return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
continue;
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
};
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
continue;
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
}
};
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
ShuffleVectorSDNode::commuteMask(InLaneMask);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Couldn't find a match with the operands in either order.
return SDValue();
}
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
continue;
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourselves.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % NumLaneElts;
if (RepeatMask[i] < NumElts) {
if (RepeatMask[i] != M % NumLaneElts)
return SDValue();
LaneSrcs[Lane][0] = M / NumLaneElts;
} else {
if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
return SDValue();
LaneSrcs[Lane][1] = M / NumLaneElts;
}
}
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
}
SmallVector<int, 16> NewMask(NumElts, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV1) &&
cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
return SDValue();
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV2) &&
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
for (int i = 0; i != NumElts; ++i) {
NewMask[i] = RepeatMask[i % NumLaneElts];
if (NewMask[i] < 0)
continue;
NewMask[i] += (i / NumLaneElts) * NumLaneElts;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
/// upper) of which operand (V1 or V2) each input of the new half-width shuffle
/// is taken from.
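/// For example, the v8 mask <u,u,u,u,12,13,6,7> yields HalfMask <0,1,6,7> with
/// HalfIdx1 = 3 (upper half of V2) and HalfIdx2 = 1 (upper half of V1).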
static bool
getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
int &HalfIdx1, int &HalfIdx2) {
assert((Mask.size() == HalfMask.size() * 2) &&
"Expected input mask to be twice as long as output");
// Exactly one half of the result must be undef to allow narrowing.
bool UndefLower = isUndefLowerHalf(Mask);
bool UndefUpper = isUndefUpperHalf(Mask);
if (UndefLower == UndefUpper)
return false;
unsigned HalfNumElts = HalfMask.size();
unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
HalfIdx1 = -1;
HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
}
// Determine which of the 4 half vectors this element is from.
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
// We can shuffle with up to 2 half vectors, set the new 'half'
// shuffle mask accordingly.
if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
// Too many half vectors referenced.
return false;
}
return true;
}
/// Given the output values from getHalfShuffleMask(), create a half width
/// shuffle of extracted vectors followed by an insert back to full width.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
SelectionDAG &DAG, bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
HalfIdx = (HalfIdx % 2) * HalfNumElts;
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
DAG.getIntPtrConstant(HalfIdx, DL));
};
// ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
if (UseConcat) {
SDValue Op0 = V;
SDValue Op1 = DAG.getUNDEF(HalfVT);
if (UndefLower)
std::swap(Op0, Op1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
}
unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.is256BitVector() || VT.is512BitVector()) &&
"Expected 256-bit or 512-bit vector");
bool UndefLower = isUndefLowerHalf(Mask);
if (!UndefLower && !isUndefUpperHalf(Mask))
return SDValue();
assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
"Completely undef shuffle mask should have been simplified already");
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(HalfNumElts);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
return SDValue();
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
unsigned NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
unsigned NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
// Determine the larger pattern of undef/halves, then decide if it's worth
// splitting the shuffle based on subtarget capabilities and types.
unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
if (!UndefLower) {
// XXXXuuuu: no insert is needed.
// Always extract lowers when setting lower - these are all free subreg ops.
if (NumUpperHalves == 0)
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps is better than vblend + vpermps.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Extract + narrow shuffle is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// Don't extract both uppers, instead shuffle and then extract.
assert(NumUpperHalves == 2 && "Half vector count went wrong");
return SDValue();
}
// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
if (NumUpperHalves == 0) {
// AVX2 has efficient 64-bit element cross-lane shuffles.
// TODO: Refine to account for unary shuffle, splat, and other masks?
if (Subtarget.hasAVX2() && EltWidth == 64)
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Narrow shuffle + insert is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
return SDValue();
}
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
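/// For example, with the v4 mask <0,5,2,7> both inputs are in place: input 0
/// supplies elements 0 and 2 and input 1 supplies elements 1 and 3, all in
/// their required slots.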
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
return false;
return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs but only referencing the lowest 128-bit
// lane of the inputs.
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
return false;
if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
}
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
continue;
if (M1[i] != M2[i])
return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
continue;
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
// Track the topmost source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
continue;
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
bool &ForceV1Zero, bool &ForceV2Zero,
unsigned &ShuffleImm, ArrayRef<int> Mask,
const APInt &Zeroable) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
"Illegal shuffle mask");
bool ZeroLane[2] = { true, true };
for (int i = 0; i < NumElts; ++i)
ZeroLane[i & 1] &= Zeroable[i];
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
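// For example, for V4F64 the mask <1,4,3,6> matches directly with
// ShuffleImm 0b0101.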
ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
continue;
if (Mask[i] < 0)
return false;
int Val = (i & 6) + NumElts * (i & 1);
int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
ShuffleImm |= (Mask[i] % 2) << i;
}
if (!ShufpdMask && !CommutableMask)
return false;
if (!ShufpdMask && CommutableMask)
std::swap(V1, V2);
ForceV1Zero = ZeroLane[0];
ForceV2Zero = ZeroLane[1];
return true;
}
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
Mask, Zeroable))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
DAG.getTargetConstant(Immediate, DL, MVT::i8));
}
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vpmovqb instructions shuffled together.
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const APInt &Zeroable,
SelectionDAG &DAG) {
assert(VT == MVT::v32i8 && "Unexpected type!");
// The first 8 indices should be every 8th element.
if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
return SDValue();
// Remaining elements need to be zeroable.
if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
return SDValue();
V1 = DAG.getBitcast(MVT::v4i64, V1);
V2 = DAG.getBitcast(MVT::v4i64, V2);
V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
// The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
// the upper bits of the result using an unpckldq.
SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
{ 0, 1, 2, 3, 16, 17, 18, 19,
4, 5, 6, 7, 20, 21, 22, 23 });
// Insert the unpckldq into a zero vector to widen to v32i8.
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
DAG.getConstant(0, DL, MVT::v32i8), Unpack,
DAG.getIntPtrConstant(0, DL));
}
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
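// For example, mask <1,0,3,2> (swap within each 128-bit lane) produces
// VPERMILPMask 0b0101.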
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Op;
// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
// canonicalize to a blend of splat which isn't necessary for this combine.
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&
(V2.getOpcode() != ISD::BUILD_VECTOR))
if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
Mask, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input even across lanes in a single
// instruction, so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because with v4
// we can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input even across lanes in a single
// instruction, so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single-input shuffle with different shuffle patterns in the
// two 128-bit lanes, use a variable-mask VPERMILPS.
if (V2.isUndef()) {
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
}
if (Subtarget.hasAVX2()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
}
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// For non-AVX512, if the mask consists of in-lane 16-bit elements, try to
// split, since after splitting we get more efficient code using vpunpcklwd
// and vpunpckhwd instructions than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512, if the mask consists of in-lane 16-bit elements, try to
// split, since after splitting we get more efficient code than with vblend
// by using vpunpcklwd and vpunpckhwd instructions.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
// If the shuffle patterns aren't repeated but it's a single input, directly
// generate a cross-lane VPERMD instruction.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
return Rotate;
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
DAG, Subtarget);
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
Mask, Zeroable, DAG))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
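// For example, a v8i32 shuffle on an AVX1-only target is lowered below as a
// v8f32 shuffle through bitcasts.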
if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
V1 = DAG.getBitcast(FpVT, V1);
V2 = DAG.getBitcast(FpVT, V2);
return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
}
/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
// most probably the better solution for that case.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
SmallVector<int, 4> Widened128Mask;
if (!canWidenShuffleElements(Mask, Widened128Mask))
return SDValue();
assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
// Try to use an insert into a zero vector.
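// For example, a mask that keeps the low two elements of V1 and zeroes the
// upper six becomes an extract of V1's low 128 bits inserted into a zero
// vector.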
if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
(Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
if (OnlyUsesV1 ||
isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
if (Widened128Mask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
if (Widened128Mask[i] < 4) {
if (Widened128Mask[i] != i) {
IsInsert = false;
break;
}
} else {
// Make sure we only have a single V2 index and it's the lowest 128 bits.
if (V2Index >= 0 || Widened128Mask[i] != 4) {
IsInsert = false;
break;
}
V2Index = i;
}
}
if (IsInsert && V2Index >= 0) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
DAG.getIntPtrConstant(0, DL));
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
// See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
// UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
// possible we at least ensure the lanes stay sequential to help later
// combines.
SmallVector<int, 2> Widened256Mask;
if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
Widened128Mask.clear();
narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
}
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
if (Widened128Mask[i] < 0)
continue;
SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
PermMask |= (Widened128Mask[i] % 4) << (i * 2);
}
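// For example, Widened128Mask {0, 2, 4, 6} gives Ops = {V1, V2} and
// PermMask = 0x88, selecting 128-bit lanes 0 and 2 of V1 followed by lanes 0
// and 2 of V2.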
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
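// For example, the in-lane swap mask {1, 0, 3, 2, 5, 4, 7, 6} yields the
// immediate 0b01010101 (0x55).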
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Op;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single-input shuffle with different shuffle patterns in the
// 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
if (V2.isUndef() &&
!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on all four
// 128-bit lanes.
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
DAG.getBitcast(MVT::v16i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
SmallVector<int, 4> Repeated256Mask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V =
lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
return Rotate;
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
return Rotate;
// Lower as AND if possible.
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (!V2.isUndef())
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// FIXME: Implement direct support for this type!
return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = Mask.size();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
// Try using bit ops for masking and blending before falling back to
// splitting.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
}
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Shuffle should be unary.
if (!V2.isUndef())
return SDValue();
int ShiftAmt = -1;
int NumElts = Mask.size();
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
"Unexpected mask index.");
if (M < 0)
continue;
// The first non-undef element determines our shift amount.
if (ShiftAmt < 0) {
ShiftAmt = M - i;
// Need to be shifting right.
if (ShiftAmt <= 0)
return SDValue();
}
// All non-undef elements must shift by the same amount.
if (ShiftAmt != M - i)
return SDValue();
}
assert(ShiftAmt >= 0 && "All undef?");
// Great we found a shift right.
MVT WideVT = VT;
if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
DAG.getUNDEF(WideVT), V1,
DAG.getIntPtrConstant(0, DL));
Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
auto CheckZeros = [&](int Shift, bool Left) {
for (int j = 0; j < Shift; ++j)
if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, bool Left) {
unsigned Pos = Left ? Shift : 0;
unsigned Low = Left ? 0 : Shift;
unsigned Len = Size - Shift;
return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
};
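// For example, an 8-element mask {2, 3, 4, 5, 6, 7, Z, Z}, where the top two
// elements are known zero, matches KSHIFTR with a shift amount of 2.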
for (int Shift = 1; Shift != Size; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
return Shift;
}
return -1;
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
int NumElts = Mask.size();
// Try to recognize shuffles that are just padding a subvector with zeros.
int SubvecElts = 0;
int Src = -1;
for (int i = 0; i != NumElts; ++i) {
if (Mask[i] >= 0) {
// Grab the source from the first valid mask element. All subsequent elements
// need to use this same source.
if (Src < 0)
Src = Mask[i] / NumElts;
if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
break;
}
++SubvecElts;
}
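// For example, the v8i1 mask {0, 1, 2, 3, 8, 9, 10, 11} with a zero V2 gives
// SubvecElts = 4: the low v4i1 of V1 is extracted and zero-padded below.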
assert(SubvecElts != NumElts && "Identity shuffle?");
// Clip to a power of 2.
SubvecElts = PowerOf2Floor(SubvecElts);
// Make sure the number of zeroable elements at the top at least covers the
// elements not covered by the subvector.
if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
assert(Src >= 0 && "Expected a source!");
MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
Src == 0 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
DAG.getConstant(0, DL, VT),
Extract, DAG.getIntPtrConstant(0, DL));
}
// Try a simple shift right with undef elements. Later we'll try with zeros.
if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
DAG))
return Shift;
// Try to match KSHIFTs.
unsigned Offset = 0;
for (SDValue V : { V1, V2 }) {
unsigned Opcode;
int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
if (ShiftAmt >= 0) {
MVT WideVT = VT;
if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
DAG.getUNDEF(WideVT), V,
DAG.getIntPtrConstant(0, DL));
// Widened right shifts need two shifts to ensure we shift in zeroes.
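// For example, on a target without DQI, a v4i1 right shift by 1 is widened to
// v16i1: KSHIFTL by 12 moves the bits into the top four lanes, then KSHIFTR
// by 13 shifts them back down with zeroes filling in.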
if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
int WideElts = WideVT.getVectorNumElements();
// Shift left to put the original vector in the MSBs of the new size.
Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
// Increase the shift amount to account for the left shift.
ShiftAmt += WideElts - NumElts;
}
Res = DAG.getNode(Opcode, DL, WideVT, Res,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
Offset += NumElts; // Increment for next iteration.
}
MVT ExtVT;
switch (VT.SimpleTy) {
default:
llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
ExtVT = MVT::v2i64;
break;
case MVT::v4i1:
ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
// Take a 512-bit type; there are more shuffle options at 512 bits on KNL. If
// we have VLX, use a 256-bit shuffle.
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;
case MVT::v32i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
assert(Subtarget.hasBWI() && "Expected AVX512BW support");
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
// Fall back to scalarization. FIXME: We can do better if the shuffle
// can be partitioned cleanly.
if (!Subtarget.useBWIRegs())
return SDValue();
ExtVT = MVT::v64i8;
break;
}
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
Shuffle, ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
int NumElements = Mask.size();
int NumV1Elements = 0, NumV2Elements = 0;
for (int M : Mask)
if (M < 0)
continue;
else if (M < NumElements)
++NumV1Elements;
else
++NumV2Elements;
// Commute the shuffle as needed such that more elements come from V1 than
// V2. This allows us to match the shuffle pattern strictly on how many
// elements come from V1 without handling the symmetric cases.
if (NumV2Elements > NumV1Elements)
return true;
assert(NumV1Elements > 0 && "No V1 indices");
if (NumV2Elements == 0)
return false;
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum of
// indices for V2. When those are equal, try to ensure that the number of odd
// indices for V1 is lower than the number of odd indices for V2.
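// For example, the v4 mask {4, 5, 2, 3} draws equally from both inputs but
// its low half comes entirely from V2, so it is commuted.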
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
if (LowV2Elements > LowV1Elements)
return true;
if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
SumV2Indices += i;
else if (Mask[i] >= 0)
SumV1Indices += i;
if (SumV2Indices < SumV1Indices)
return true;
if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return true;
}
}
}
return false;
}
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node, we put the UNDEF node as the second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In this case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
(void)MaskUpperLimit;
assert(llvm::all_of(OrigMask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt KnownUndef, KnownZero;
computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
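// For example, a v8i32 mask {0, 1, 4, 5, 8, 9, 12, 13} widens to the v4i64
// mask {0, 2, 4, 6}.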
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
int NewNumElts = NumElements / 2;
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
if (V2IsZero) {
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
WidenedMask[i] = i + NewNumElts;
UsedZeroVector = true;
}
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
// some elements to be undef.
if (UsedZeroVector)
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
}
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
// Commute the shuffle if it will improve canonicalization.
SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
if (canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
// Only non-legal VSELECTs reach this lowering, convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
SmallVector<int, 32> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
return SDValue();
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
return SDValue();
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
// with patterns on the mask registers on AVX-512.
MVT CondVT = Cond.getSimpleValueType();
unsigned CondEltSize = Cond.getScalarValueSizeInBits();
if (CondEltSize == 1)
return Op;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
// Expand v32i16/v64i8 without BWI.
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return SDValue();
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
if (VT.getSizeInBits() == 512) {
// Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
DAG.getConstant(0, dl, CondVT),
ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
}
// SEXT/TRUNC cases where the mask doesn't match the destination size.
if (CondEltSize != EltSize) {
// If we don't have a sign splat, rely on the expansion.
if (CondEltSize != DAG.ComputeNumSignBits(Cond))
return SDValue();
MVT NewCondSVT = MVT::getIntegerVT(EltSize);
MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
}
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, but if we need to expand, return a null
// value.
switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget.hasAVX2())
return Op;
return SDValue();
case MVT::v8i16:
case MVT::v16i16: {
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
Cond = DAG.getBitcast(CastVT, Cond);
LHS = DAG.getBitcast(CastVT, LHS);
RHS = DAG.getBitcast(CastVT, RHS);
SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
}
}
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
SDLoc dl(Op);
if (!Vec.getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
// we're going to zero extend the register or fold the store.
if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
!MayFoldIntoStore(Op))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
// the result back to an FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
// the case of a store, it's not worth it if the index is a constant 0,
// because a MOVSSmr can be used instead, which is smaller and faster.
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx);
return DAG.getBitcast(MVT::f32, Extract);
}
if (VT == MVT::i32 || VT == MVT::i64)
return Op;
return SDValue();
}
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers;
// extend the vector to VR512/VR128 instead.
if (!IdxC) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512 bits gets better performance on KNL
// than extending to 128/256 bits.
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
unsigned IdxVal = IdxC->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
// Extend to natively supported kshift.
unsigned NumElems = VecVT.getVectorNumElements();
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
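// Editorial note on ExtractBitFromMaskVector above (illustrative): with a
// constant non-zero index, e.g. extracting bit 5 of a v16i1 mask, the
// lowering is roughly
//   (extract_vector_elt v16i1 %k, 5)
//     --> (extract_vector_elt (X86ISD::KSHIFTR %k, 5), 0)
// i.e. a kshift moves the requested bit into position 0, where the extract
// is legal.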
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!IdxC) {
// It's more profitable to go through memory (1 cycle throughput)
// than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
// The IACA tool was used to get the performance estimates
// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
//
// example : extractelement <16 x i8> %a, i32 %i
//
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
//
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
//
//
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
//
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
}
unsigned IdxVal = IdxC->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8; we can probably afford
// to be more aggressive here before using the default approach of spilling to
// the stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
return SDValue();
}
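// Editorial note on LowerEXTRACT_VECTOR_ELT above (illustrative): without
// SSE4.1, extracting byte 5 of a single-use v16i8 takes the sub-byte path:
//   (extract_vector_elt v16i8 %v, 5)
//     --> trunc (srl (extract_vector_elt (bitcast %v to v8i16), 2), 8)
// i.e. extract word 2 with PEXTRW and shift the wanted byte down.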
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
MVT VecVT = Vec.getSimpleValueType();
if (!isa<ConstantSDNode>(Idx)) {
// Non-constant index. Extend the source and destination,
// insert the element and then truncate the result.
unsigned NumElts = VecVT.getVectorNumElements();
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle against a rematerializable vector rather than a costly
// integer insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
DAG.getTargetConstant(1, dl, MVT::i8));
}
}
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2, we can use a mask instead of a modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
EltVT == MVT::i64) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, N1);
}
}
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 is required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
Opc = X86ISD::PINSRW;
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
Opc = X86ISD::PINSRB;
}
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
DAG.getTargetConstant(1, dl, MVT::i8));
}
// Create this as a scalar-to-vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
}
// PINSR* works with constant index.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
}
return SDValue();
}
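// Editorial note on LowerINSERT_VECTOR_ELT above (illustrative): inserting a
// zero into lane 2 of a v4i32 on SSE4.1 uses the blend-shuffle path rather
// than a GPR insertion:
//   (insert_vector_elt v4i32 %v, 0, 2)
//     --> shuffle %v, zeroinitializer, <0, 1, 6, 3>
// where mask value 6 selects element 2 of the rematerializable all-zeros
// operand.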
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
// It's always cheaper to replace an xor+movd with xorps, and it simplifies
// further combines.
if (X86::isZeroNode(Op.getOperand(0)))
return getZeroVector(OpVT, Subtarget, DAG, dl);
// If this is a wider-than-128-bit vector result, first insert into a 128-bit
// vector and then insert that into the full-width vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
"Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
return insert1BitVector(Op, DAG, Subtarget);
}
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Only vXi1 extract_subvectors need custom lowering");
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
uint64_t IdxVal = Op.getConstantOperandVal(1);
if (IdxVal == 0) // the operation is legal
return Op;
MVT VecVT = Vec.getSimpleValueType();
unsigned NumElems = VecVT.getVectorNumElements();
// Extend to natively supported kshift.
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
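// Editorial note on LowerEXTRACT_SUBVECTOR above (illustrative): extracting
// the v2i1 subvector at index 2 from a v8i1 mask becomes roughly
//   (extract_subvector v8i1 %k, 2)
//     --> (extract_subvector (X86ISD::KSHIFTR %k, 2), 0)
// after widening %k to v16i1 first if the subtarget lacks DQI.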
// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(
const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
if (OpFlags == X86II::MO_GOTPCREL)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
}
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddress node.
unsigned char OpFlags =
Subtarget.classifyBlockAddressReference();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
}
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const {
// Unpack the global address or external symbol.
const SDLoc &dl = SDLoc(Op);
const GlobalValue *GV = nullptr;
int64_t Offset = 0;
const char *ExternalSym = nullptr;
if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
GV = G->getGlobal();
Offset = G->getOffset();
} else {
const auto *ES = cast<ExternalSymbolSDNode>(Op);
ExternalSym = ES->getSymbol();
}
// Calculate some flags for address lowering.
const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlags;
if (ForCall)
OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
else
OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
bool NeedsLoad = isGlobalStubReference(OpFlags);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
int64_t GlobalOffset = 0;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
std::swap(GlobalOffset, Offset);
}
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
// If this is not a global address, this must be an external symbol.
Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
// If this is a direct call, avoid the wrapper if we don't need to do any
// loads or adds. This allows SDAG ISel to match direct calls.
if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
return Result;
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
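// Editorial sketch for LowerGlobalOrExternal above (illustrative; the exact
// sequence depends on the code model and relocation flags): a non-PIC small
// code model reference stays a bare wrapped TargetGlobalAddress (matching
// something like "movl $foo, %eax"), while a 32-bit ELF PIC reference that
// needs a GOT stub becomes wrapper(TGA) + GlobalBaseReg followed by a load of
// the GOT slot, roughly "movl foo@GOT(%ebx), %eax".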
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
: X86ISD::TLSADDR;
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
MFI.setAdjustsStack(true);
MFI.setHasCalls(true);
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32-bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
SDLoc dl(GA); // TODO: the function entry point might be a better location
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64-bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
.getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
if (is64Bit) {
Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
}
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of Base.
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
// initial exec.
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
} else if (model == TLSModel::InitialExec) {
if (is64Bit) {
OperandFlags = X86II::MO_GOTTPOFF;
WrapperKind = X86ISD::WrapperRIP;
} else {
OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
}
} else {
llvm_unreachable("Unexpected model");
}
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA =
DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]; Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of the start of the .tls section.
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
// ISD::SRA/L nodes don't. Insert an AND to be safe; it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i8))
: DAG.getConstant(0, dl, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
// If the shift amount is greater than or equal to the width of a part, we
// can't rely on the results of shld/shrd. Insert a test and select the
// appropriate values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
SDValue Hi, Lo;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}
return DAG.getMergeValues({ Lo, Hi }, dl);
}
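// Editorial worked example for LowerShiftParts above: for a 64-bit SHL_PARTS
// split into 32-bit halves with a shift amount of 40, bit 5 of the amount is
// set, so the selects pick Hi = Lo << (40 & 31) = Lo << 8 and Lo = 0, which
// matches shifting the full 64-bit value left by 40.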
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (VT.isVector()) {
assert(Subtarget.hasVBMI2() && "Expected VBMI2");
if (IsFSHR)
std::swap(Op0, Op1);
APInt APIntShiftAmt;
if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
}
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
assert(
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
// fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
// fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
!isa<ConstantSDNode>(Amt)) {
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
if (IsFSHR) {
Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
} else {
Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
}
return DAG.getZExtOrTrunc(Res, DL, VT);
}
if (VT == MVT::i8 || ExpandFunnel)
return SDValue();
// i16 needs an explicit modulo of the shift amount; i32/i64 have an implicit modulo.
if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
}
return Op;
}
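// Editorial worked example for the i8 expansion in LowerFunnelShift above:
// fshl i8 0x12, 0x34, 4 concatenates to 0x1234, shifts left by (4 & 7) to
// 0x12340, shifts the high half back down by 8 to 0x123, and truncates to
// 0x23, the same result as (0x12 << 4) | (0x34 >> 4).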
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
Op.getOpcode() == ISD::UINT_TO_FP) &&
"Unexpected opcode!");
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
(VT != MVT::f32 && VT != MVT::f64))
return SDValue();
// Pack the i64 into a vector, do the operation and extract.
// Using 256-bit vectors ensures the result is 128 bits for the f32 case.
unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecVT = MVT::getVectorVT(VT, NumElts);
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
if (IsStrict) {
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
{Op.getOperand(0), InVec});
SDValue Chain = CvtVec.getValue(1);
SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Value, Chain}, dl);
}
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
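// Editorial sketch for LowerI64IntToFP_AVX512DQ above (illustrative): on a
// 32-bit target with AVX512DQ, a scalar i64 -> f64 signed conversion becomes
// roughly
//   (sint_to_fp i64 %x)
//     --> extract_elt (sint_to_fp (scalar_to_vector vNi64 %x)), 0
// so the work is done by a packed VCVTQQ2PD instead of the stack/x87 path.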
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
case ISD::SINT_TO_FP:
// TODO: Handle wider types with AVX/AVX512.
if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
return false;
// CVTDQ2PS or (V)CVTDQ2PD
return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
case ISD::UINT_TO_FP:
// TODO: Handle wider types and i64 elements.
if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
return false;
// VCVTUDQ2PS or VCVTUDQ2PD
return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
default:
return false;
}
}
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: This could be enhanced to handle smaller integer types by peeking
// through an extend.
SDValue Extract = Cast.getOperand(0);
MVT DestVT = Cast.getSimpleValueType();
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Extract.getOperand(1)))
return SDValue();
// See if we have a 128-bit vector cast op for this type of cast.
SDValue VecOp = Extract.getOperand(0);
MVT FromVT = VecOp.getSimpleValueType();
unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
return SDValue();
// If we are extracting from a non-zero element, first shuffle the source
// vector to allow extracting from element zero.
SDLoc DL(Cast);
if (!isNullConstant(Extract.getOperand(1))) {
SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
Mask[0] = Extract.getConstantOperandVal(1);
VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
}
// If the source vector is wider than 128 bits, extract the low part. Do not
// create an unnecessarily wide vector cast op.
if (FromVT != Vec128VT)
VecOp = extract128BitVector(VecOp, 0, DAG, DL);
// cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
// cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
DAG.getIntPtrConstant(0, DL));
}
/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
/// try to vectorize the cast ops. This will avoid an expensive round-trip
/// between XMM and GPR.
static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: Allow FP_TO_UINT.
SDValue CastToInt = CastToFP.getOperand(0);
MVT VT = CastToFP.getSimpleValueType();
if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
return SDValue();
MVT IntVT = CastToInt.getSimpleValueType();
SDValue X = CastToInt.getOperand(0);
MVT SrcVT = X.getSimpleValueType();
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return SDValue();
// See if we have 128-bit vector cast instructions for this type of cast.
// We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
IntVT != MVT::i32)
return SDValue();
unsigned SrcSize = SrcVT.getSizeInBits();
unsigned IntSize = IntVT.getSizeInBits();
unsigned VTSize = VT.getSizeInBits();
MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
unsigned ToIntOpcode =
SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
unsigned ToFPOpcode =
IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
//
// We are not defining the high elements (for example, by zeroing them) because
// that could nullify any performance advantage that we hoped to gain from
// this vector op hack. We do not expect any adverse effects (like denorm
// penalties) with cast ops.
SDLoc DL(CastToFP);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}
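// Editorial example for lowerFPToIntToFP above (illustrative): for
// "(double)(int)x" with x a double and SSE2 available, the lowering builds
//   scalar_to_vector x --> X86ISD::CVTTP2SI (cvttpd2dq)
//                      --> X86ISD::CVTSI2P (cvtdq2pd) --> extract element 0
// keeping the round-trip entirely in XMM registers instead of bouncing the
// intermediate i32 through a GPR.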
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
if (Subtarget.hasDQI()) {
assert(!Subtarget.hasVLX() && "Unexpected features");
assert((Src.getSimpleValueType() == MVT::v2i64 ||
Src.getSimpleValueType() == MVT::v4i64) &&
"Unsupported custom type");
// With AVX512DQ but without VLX, we need to widen to get a 512-bit result type.
assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
: DAG.getUNDEF(MVT::v8i64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
if (VT != MVT::v4f32 || IsSigned)
return SDValue();
SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
for (int i = 0; i != 4; ++i) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
{Op.getOperand(0), Elt});
Chains[i] = SignCvts[i].getValue(1);
} else {
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
}
}
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
SDValue Slow, Chain;
if (IsStrict) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
}
IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
if (IsStrict)
return DAG.getMergeValues({Cvt, Chain}, DL);
return Cvt;
}
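// Editorial worked example for the unsigned v4i64 -> v4f32 path above: an
// element such as 2^63 compares as "negative" under the signed SETLT, so the
// halved value (x >> 1) | (x & 1) = 2^62 is converted instead and then doubled
// by the final FADD, giving 2^63 as a float; non-negative elements take the
// direct signed conversion. OR-ing in the low bit keeps the rounding of odd
// values correct before the doubling.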
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
return R;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: since v2f64 is a legal type, we don't need to zero extend the
// source for strict FP.
if (IsStrict)
return DAG.getNode(
X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT))});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
// These are really Legal; return the operand so the caller accepts it as
// Legal.
if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
// SSE doesn't have an i16 conversion so we need to promote.
if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{Chain, Ext});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
}
if (VT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
unsigned Size = SrcVT.getStoreSize();
Align Alignment(Size);
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
std::pair<SDValue, SDValue> Tmp =
BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
}
std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
// Build the FILD
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(DstVT);
if (useSSE)
Tys = DAG.getVTList(MVT::f80, MVT::Other);
else
Tys = DAG.getVTList(DstVT, MVT::Other);
SDValue FILDOps[] = {Chain, Pointer};
SDValue Result =
DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
Alignment, MachineMemOperand::MOLoad);
Chain = Result.getValue(1);
if (useSSE) {
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = DstVT.getStoreSize();
int SSFI =
MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue FSTOps[] = {Chain, Result, StackSlot};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
Chain =
DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
Result = DAG.getLoad(
DstVT, DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
Chain = Result.getValue(1);
}
return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.shouldOptForSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
haddpd %xmm0, %xmm0
#else
pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
SmallVector<Constant*,2> CV1;
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
// Load the 64-bit value into an XMM register.
SDValue XR1 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
SDValue Sub;
SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), XR2F, CLod1});
Chain = Sub.getValue(1);
} else
Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (!IsStrict && Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
// FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
if (IsStrict) {
Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
{Chain, Shuffle, Sub});
Chain = Result.getValue(1);
} else
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Result, Chain}, dl);
return Result;
}
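// Editorial note on the constants in LowerUINT_TO_FP_i64 above: after the
// punpckldq, the two doubles hold the exact values 2^52 + lo32 and
// 2^84 + hi32 * 2^32. Subtracting the c1 constants (2^52 and 2^84) leaves
// lo32 and hi32 * 2^32 exactly, and the horizontal add recombines them into
// the original 64-bit value as a double.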
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant to bias-correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
SDValue Load =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
if (Op.getNode()->isStrictFPOpcode()) {
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Chain = Op.getOperand(0);
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
{Chain, Or, Bias});
if (Op.getValueType() == Sub.getValueType())
return Sub;
// Handle final rounding.
std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
}
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
bool IsStrict = Op->isStrictFPOpcode();
SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
// Let generic type legalization widen this.
if (!IsStrict)
return SDValue();
// Otherwise pad the integer input with 0s and widen the operation.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getConstant(0, DL, MVT::v2i32));
SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), N0});
SDValue Chain = Res.getValue(1);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
DAG.getIntPtrConstant(0, DL));
return DAG.getMergeValues({Res, Chain}, DL);
}
// Legalize to v4i32 type.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
}
// Zero extend to v2i64 and OR with the floating point representation of 2^52.
// This gives us the floating point equivalent of 2^52 + the i32 integer,
// since double has 52 bits of mantissa. Then subtract 2^52 in floating
// point, leaving just our i32 integers in double format.
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
SDValue V = Op->getOperand(IsStrict ? 1 : 0);
MVT VecIntVT = V.getSimpleValueType();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
if (Subtarget.hasAVX512()) {
// With AVX512, but not VLX we need to widen to get a 512-bit result type.
assert(!Subtarget.hasVLX() && "Unexpected features");
MVT VT = Op->getSimpleValueType(0);
// v8i32->v8f64 is legal with AVX512 so just return it.
if (VT == MVT::v8f64)
return Op;
assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp =
IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
{Op->getOperand(0), V});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
Op->getSimpleValueType(0) == MVT::v4f64) {
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
Constant *Bias = ConstantFP::get(
*DAG.getContext(),
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDValue VBias = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
MachineMemOperand::MOLoad);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
Or = DAG.getBitcast(MVT::v4f64, Or);
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
}
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
// #else
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
// #endif
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
// abort early.
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
// -- 0x53000000
// - A shift:
// -- v >> 16
// Create the splat vector for 0x4b000000.
SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
if (Subtarget.hasSSE41()) {
MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
// Create the vector constant for (0x1.0p39f + 0x1.0p23f).
SDValue VecCstFSub = DAG.getConstantFP(
APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// NOTE: By using fsub of a positive constant instead of fadd of a negative
// constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
// enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
// (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
// return (float4) lo + fhi;
if (IsStrict) {
SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
{Op.getOperand(0), HighBitcast, VecCstFSub});
return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
{FHigh.getValue(1), LowBitcast, FHigh});
}
SDValue FHigh =
DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
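// A standalone scalar model of the lo/hi split described in the comment above
// (illustration only; the name u32ToFloatSplit is hypothetical). 0x4b000000 is
// the bit pattern of 2^23 and 0x53000000 of 2^39, so Lo holds 2^23 + the low
// 16 bits and Hi holds 2^39 + (the high 16 bits << 16); subtracting
// (2^39 + 2^23) from Hi is exact, and the final add rounds once, matching
// (float)V under round-to-nearest.
#if 0
#include <cstdint>
#include <cstring>
static float u32ToFloatSplit(uint32_t V) {
  uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // 2^23 + low 16 bits.
  uint32_t HiBits = (V >> 16) | 0x53000000u;     // 2^39 + (high 16 bits << 16).
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - 549764202496.0f;              // 2^39 + 2^23 (0x53000080).
  return Lo + FHi;                               // Single rounding.
}
#endif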
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v2i64:
case MVT::v4i64:
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op->getSimpleValueType(0);
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
}
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
{Chain, Src});
return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MPI.getWithOffset(4), 4);
std::pair<SDValue, SDValue> Tmp =
BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Src;
if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
Align(8), MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
CPAlignment);
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
{Chain, Fild, Fudge});
// STRICT_FP_ROUND can't handle equal types.
if (DstVT == MVT::f80)
return Add;
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
}
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
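// A standalone scalar model of the FILD-plus-fudge path above (illustration
// only; the name u64ToDoubleViaFild is hypothetical). The high half of the
// pool constant 0x5F80000000000000 is the f32 bit pattern of 2^64, selected
// when the source's sign bit is set; the x87 f80 work is modelled here with
// long double, which is 80-bit on typical x86 ELF targets but not everywhere.
#if 0
#include <cstdint>
static double u64ToDoubleViaFild(uint64_t V) {
  long double Fild = (long double)(int64_t)V; // FILD reads the slot as signed.
  long double Fudge =
      ((int64_t)V < 0) ? 18446744073709551616.0L : 0.0L; // 2^64 or 0.
  return (double)(Fild + Fudge); // The only rounding is the f80 -> f64 step.
}
#endif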
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, SDValue &Chain) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
int SSFI =
MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
//
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
// Adjust = (Value < Thresh) ? 0 : 0x80000000;
// FltOfs = (Value < Thresh) ? 0.0 : Thresh;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
//
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
Chain, /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
}
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
DAG.getConstantFP(0.0, DL, TheVT),
ThreshVal);
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
{ Chain, Value, FltOfs });
Chain = Value.getValue(1);
} else
Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
return Res;
}
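// A standalone scalar model of the UnsignedFixup above (illustration only; the
// name doubleToU64ViaFist is hypothetical, and NaN/out-of-range inputs are not
// modelled): values below 2^63 convert directly, larger values are shifted
// down by Thresh before the signed conversion and the top bit is restored with
// an XOR, mirroring the Adjust/FltOfs selects.
#if 0
#include <cstdint>
static uint64_t doubleToU64ViaFist(double X) {
  const double Thresh = 9223372036854775808.0; // 2^63.
  if (X < Thresh)
    return (uint64_t)(int64_t)X;               // FltOfs = 0, Adjust = 0.
  int64_t Fist = (int64_t)(X - Thresh);        // Exact for X in [2^63, 2^64).
  return (uint64_t)Fist ^ 0x8000000000000000ULL; // Adjust = sign mask.
}
#endif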
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(InVT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
// Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
// Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
// Concat upper and lower parts.
//
// v4i32 -> v4i64
// Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
// Otherwise, this is difficult to match and optimize.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
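// A standalone intrinsics sketch of the AVX1 v8i16 -> v8i32 zero-extension
// described in the comment above (illustration only; the name
// zext_v8i16_v8i32 is hypothetical; assumes SSE4.1 + AVX, e.g. -mavx).
#if 0
#include <immintrin.h>
static __m256i zext_v8i16_v8i32(__m128i In) {
  __m128i Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd: low 4 elts.
  __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd with zero.
  return _mm256_set_m128i(Hi, Lo);                          // Concat upper and lower.
}
#endif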
// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
const SDLoc &dl, SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(8, dl));
Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors but vXi8, we can just emit a sign_extend and a shift. This
// avoids a constant pool load.
if (VT.getVectorElementType() != MVT::i8) {
SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
return DAG.getNode(ISD::SRL, DL, VT, Extend,
DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
}
// Extend VT if BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI()) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, DL));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
NumElts);
}
SDValue One = DAG.getConstant(1, DL, WideVT);
SDValue Zero = DAG.getConstant(0, DL, WideVT);
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
// Truncate if we had to extend above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
DAG.getIntPtrConstant(0, DL));
return SelectedVal;
}
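// A standalone AVX-512 sketch of the select(In, One, Zero) form built above
// (illustration only; the name mask_zext_v16i1_v16i32 is hypothetical; assumes
// AVX512F, e.g. -mavx512f): a zero-masked broadcast of 1 yields 1 in the lanes
// whose mask bit is set and 0 elsewhere.
#if 0
#include <immintrin.h>
static __m512i mask_zext_v16i1_v16i32(__mmask16 K) {
  return _mm512_maskz_set1_epi32(K, 1); // Lane i = K[i] ? 1 : 0.
}
#endif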
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
// Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
if (!Subtarget.hasSSE2())
return SDValue();
EVT SrcVT = In.getValueType();
// No truncation required, we might get here due to recursive calls.
if (SrcVT == DstVT)
return In;
// We only support vector truncation to 64bits or greater from a
// 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
unsigned NumElems = SrcVT.getVectorNumElements();
if (!isPowerOf2_32(NumElems))
return SDValue();
LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
}
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
if (SrcVT.is128BitVector()) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
// Split lower/upper subvectors.
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(In, DAG, DL);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
}
// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
// If 512bit -> 128bit truncate another stage.
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
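// A standalone sketch of why the leading-zero requirement above matters
// (illustration only; the name trunc_v8i16_v8i8_packus is hypothetical;
// assumes SSE2): PACKUSWB saturates, so the upper byte of every word must
// already be zero for the pack to act as a plain truncation.
#if 0
#include <immintrin.h>
static __m128i trunc_v8i16_v8i8_packus(__m128i In) {
  In = _mm_and_si128(In, _mm_set1_epi16(0x00ff)); // Clear upper bytes: no saturation.
  return _mm_packus_epi16(In, In);                // Low 8 bytes are the truncated values.
}
#endif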
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// legal, will go to VPMOVB2M, VPMOVW2M
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
// Shift packed bytes not supported natively, bitcast to word
MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
In = DAG.getNode(ISD::SHL, DL, ExtVT,
DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
}
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
In, ISD::SETGT);
}
// Use TESTD/Q: extend the vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
// We need to change to a wider element type that we have support for.
// For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
// For 16 element vectors we extend to v16i32 unless we are explicitly
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
// the original type is v16i8. In that case we can't split the v16i8
// directly, so we need to shuffle high elements to low and use
// sign_extend_vector_inreg.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
SDValue Lo, Hi;
if (InVT == MVT::v16i8) {
Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
Hi = DAG.getVectorShuffle(
InVT, DL, In, In,
{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
} else {
assert(InVT == MVT::v16i16 && "Unexpected VT!");
Lo = extract128BitVector(In, 0, DAG, DL);
Hi = extract128BitVector(In, 8, DAG, DL);
}
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
// We either have 8 elements or we're allowed to use 512-bit vectors.
// If we have VLX, we want to use the narrowest vector that can get the
// job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;
}
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
}
// If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
if (Subtarget.hasDQI())
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}
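// A standalone SSE2 sketch of the "shift the LSB into the sign position" idea
// above, for targets without AVX-512 mask registers (illustration only; the
// name trunc_v16i8_to_bitmask is hypothetical). The shift is done on word
// lanes, mirroring the bitcast-to-word workaround in the code, since packed
// byte shifts do not exist; PMOVMSKB only reads bit 7 of each byte, so the
// stray bits do not matter.
#if 0
#include <immintrin.h>
static int trunc_v16i8_to_bitmask(__m128i In) {
  In = _mm_slli_epi16(In, 7);   // Bit 0 of each byte ends up in that byte's bit 7.
  return _mm_movemask_epi8(In); // One result bit per byte lane.
}
#endif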
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
// If we're called by the type legalizer, handle a few cases.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
"Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
// Otherwise let default legalization handle it.
return SDValue();
}
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(VT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
}
// word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
// handled by isel patterns.
if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
Subtarget.canExtendTo512DQ())
return Op;
}
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known = DAG.computeKnownBits(In);
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
// Handle truncation of V256 to V128 using shuffles.
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
}
if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
// Use an AND to zero the upper bits for PACKUS.
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(0, DL));
SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(8, DL));
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
llvm_unreachable("All 256->128 cases should have been handled above!");
}
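// A standalone AVX2 sketch of the VPERMD form of the v4i64 -> v4i32 truncation
// above (illustration only; the name trunc_v4i64_v4i32 is hypothetical;
// assumes AVX2, e.g. -mavx2): viewed as eight dwords, the even-indexed dwords
// are the low halves of the qwords (little endian), so gathering {0,2,4,6}
// into the bottom lanes performs the truncation.
#if 0
#include <immintrin.h>
static __m128i trunc_v4i64_v4i32(__m256i In) {
  const __m256i Idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  __m256i Perm = _mm256_permutevar8x32_epi32(In, Idx); // vpermd.
  return _mm256_castsi256_si128(Perm);                 // Keep the low 128 bits.
}
#endif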
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
Opc = Op.getOpcode();
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
: DAG.getUNDEF(MVT::v8f64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
}
SDValue Res, Chain;
if (IsStrict) {
Res =
DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, ResVT, Src);
}
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && "Requires avx512f");
return Op;
}
// Widen vXi32 fp_to_uint with avx512f to 512-bit source.
if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
(SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
"Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
(SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
!Subtarget.hasVLX() && "Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
if (!Subtarget.hasVLX()) {
// Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
// legalizer and then widened again by vector op legalization.
if (!IsStrict)
return SDValue();
SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
{Src, Zero, Zero, Zero});
Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Tmp});
SDValue Chain = Tmp.getValue(1);
Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Tmp, Chain}, dl);
return Tmp;
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
: X86ISD::STRICT_CVTTP2UI;
return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
}
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
return DAG.getNode(Opc, dl, VT, Tmp);
}
return SDValue();
}
assert(!VT.isVector());
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
if (!IsSigned && UseSSEReg) {
// Conversions from f32/f64 with AVX512 should be legal.
if (Subtarget.hasAVX512())
return Op;
// Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (Subtarget.is64Bit()) {
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
}
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
// use fisttp which will be handled later.
if (!Subtarget.hasSSE3())
return SDValue();
}
// Promote i16 to i32 if we can use a SSE operation or the type is f128.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i16. PR44019
if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
}
// If this is a FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
// fp128 needs to use a libcall.
if (SrcVT == MVT::f128) {
RTLIB::Libcall LC;
if (IsSigned)
LC = RTLIB::getFPTOSINT(SrcVT, VT);
else
LC = RTLIB::getFPTOUINT(SrcVT, VT);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
SDLoc(Op), Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
// Fall back to X87.
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
if (IsStrict)
return DAG.getMergeValues({V, Chain}, dl);
return V;
}
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
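// A standalone scalar sketch of the promote-to-signed-i64 trick used above for
// FP_TO_UINT i32 (illustration only; the name doubleToU32ViaS64 is
// hypothetical). As the FIXMEs note, this does not raise the invalid exception
// for inputs outside the u32 range; for in-range inputs the low 32 bits of the
// signed i64 result are the correct u32 value.
#if 0
#include <cstdint>
static uint32_t doubleToU32ViaS64(double X) {
  return (uint32_t)(int64_t)X; // fp -> s64, then keep the low 32 bits.
}
#endif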
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// If the source is in an SSE register, the node is Legal.
if (isScalarFPTypeInSSEReg(SrcVT))
return Op;
return LRINT_LLRINTHelper(Op.getNode(), DAG);
}
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
SelectionDAG &DAG) const {
EVT DstVT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
}
SDLoc DL(N);
SDValue Chain = DAG.getEntryNode();
bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
// If we're converting from SSE, the stack slot needs to hold both types.
// Otherwise it only needs to hold the DstVT.
EVT OtherVT = UseSSE ? SrcVT : DstVT;
SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
if (UseSSE) {
assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackPtr };
Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
/*Align*/ None, MachineMemOperand::MOLoad);
Chain = Src.getValue(1);
}
SDValue StoreOps[] = { Chain, Src, StackPtr };
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
StoreOps, DstVT, MPI, /*Align*/ None,
MachineMemOperand::MOStore);
return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
return LowerF128Call(Op, DAG, LC);
}
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
SDValue Res =
DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
if (SVT != MVT::f128)
return Op;
RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDLoc dl(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
dl, Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
"Unexpected VT!");
SDLoc dl(Op);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
DAG.getConstant(0, dl, MVT::v8i16), Src,
DAG.getIntPtrConstant(0, dl));
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
{Op.getOperand(0), Res});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
}
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
"Unexpected VT!");
SDLoc dl(Op);
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
DAG.getConstantFP(0, dl, MVT::v4f32), Src,
DAG.getIntPtrConstant(0, dl));
Res = DAG.getNode(
X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
{Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
Chain = Res.getValue(1);
} else {
// FIXME: Should we use zeros for upper elements for non-strict?
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
DAG.getTargetConstant(4, dl, MVT::i32));
}
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
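// A standalone F16C sketch of the two lowerings above (illustration only; the
// names f16_to_f32 / f32_to_f16 are hypothetical; assumes -mf16c). The
// immediate 4 passed to CVTPS2PH above corresponds to
// _MM_FROUND_CUR_DIRECTION, i.e. round using the current MXCSR rounding mode.
#if 0
#include <immintrin.h>
static float f16_to_f32(unsigned short H) { return _cvtsh_ss(H); }
static unsigned short f32_to_f16(float F) {
  return _cvtss_sh(F, _MM_FROUND_CUR_DIRECTION);
}
#endif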
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If both operands have other uses, this is probably not profitable.
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (!LHS.hasOneUse() && !RHS.hasOneUse())
return Op;
// FP horizontal add/sub were added with SSE3. Integer with SSSE3.
bool IsFP = Op.getSimpleValueType().isFloatingPoint();
if (IsFP && !Subtarget.hasSSE3())
return Op;
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
// Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
!isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
// Allow commuted 'hadd' ops.
// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
unsigned HOpcode;
switch (Op.getOpcode()) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default:
llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
unsigned NumLanes = BitWidth / 128;
unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
if (BitWidth == 256 || BitWidth == 512) {
unsigned LaneIdx = LExtIndex / NumEltsPerLane;
X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
LExtIndex %= NumEltsPerLane;
}
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
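// A standalone SSE3 sketch of the rewrite above (illustration only; the name
// sum_two_low_lanes is hypothetical; assumes -msse3): x[0] + x[1] is element 0
// of HADDPS(x, x), matching the "add (extractelt (X, 0), extractelt (X, 1))"
// pattern in the comment.
#if 0
#include <immintrin.h>
static float sum_two_low_lanes(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X); // {x0+x1, x2+x3, x0+x1, x2+x3}.
  return _mm_cvtss_f32(H);      // Extract element 0.
}
#endif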
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
/// This mode isn't supported in hardware on X86. But as long as we aren't
/// compiling with trapping math, we can emulate this with
/// floor(X + copysign(nextafter(0.5, 0.0), X)).
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// N0 += copysign(nextafter(0.5, 0.0), N0)
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
bool Ignored;
APFloat Point5Pred = APFloat(0.5f);
Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
Point5Pred.next(/*nextDown*/true);
SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
DAG.getConstantFP(Point5Pred, dl, VT), N0);
N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
// Truncate the result to remove fraction.
return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
}
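// A standalone scalar sketch of the FROUND emulation above (illustration only;
// the name roundAwayFromZero is hypothetical; assumes non-trapping math, as
// the comment requires): the nextafter keeps values just below .5 from being
// nudged across the boundary, while halfway cases still round away from zero.
#if 0
#include <cmath>
static double roundAwayFromZero(double X) {
  double Adder = std::copysign(std::nextafter(0.5, 0.0), X);
  return std::trunc(X + Adder);
}
#endif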
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
// If this is a FABS and it has an FNEG user, bail out to fold the combination
// into an FNABS. We'll lower the FABS after that if it is still in use.
if (IsFABS)
for (SDNode *User : Op->uses())
if (User->getOpcode() == ISD::FNEG)
return Op;
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFABSorFNEG");
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
APInt::getSignMask(EltBits);
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp = IsFABS ? X86ISD::FAND :
IsFNABS ? X86ISD::FOR :
X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
}
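// A standalone scalar sketch of the masks used above (illustration only; the
// names fabsBits / fnegBits are hypothetical): FABS ANDs with
// getSignedMaxValue (0x7ff...f, clearing the sign bit) and FNEG XORs with
// getSignMask (0x800...0, flipping it).
#if 0
#include <cstdint>
#include <cstring>
static double fabsBits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B &= 0x7fffffffffffffffULL; // Clear the sign bit.
  std::memcpy(&X, &B, sizeof(X));
  return X;
}
static double fnegBits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B ^= 0x8000000000000000ULL; // Flip the sign bit.
  std::memcpy(&X, &B, sizeof(X));
  return X;
}
#endif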
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mag = Op.getOperand(0);
SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
// If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
if (Sign.getSimpleValueType().bitsLT(VT))
Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
// Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
// TODO: This isn't necessary. If we used scalar types, we might avoid some
// unnecessary splats, but we might miss load folding opportunities. Should
// this decision be based on OptimizeForSize?
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
APFloat APF = Op0CN->getValueAPF();
APF.clearSign();
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT OpVT = N0.getSimpleValueType();
assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
"Unexpected type for FGETSIGN");
// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
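// MOVMSK packs the per-lane sign bits into the low bits of an i32, so masking
// with 1 isolates the sign bit of lane 0 (the original scalar operand).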
MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
Res = DAG.getZExtOrTrunc(Res, dl, VT);
Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
return Res;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns. Partial reductions
/// are supported when the pointer SrcMask is non-null.
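/// On success, SrcOps holds the distinct source vectors and, when SrcMask is
/// non-null, the per-source masks of which elements were covered.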
/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
SmallVectorImpl<SDValue> &SrcOps,
SmallVectorImpl<APInt> *SrcMask = nullptr) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
assert(Op.getOpcode() == unsigned(BinOp) &&
"Unexpected bit reduction opcode");
Opnds.push_back(Op.getOperand(0));
Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all BinOp operands.
if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if this is not an EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
// Quit if the index is not a constant.
auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
if (!Idx)
return false;
SDValue Src = I->getOperand(0);
DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
if (M == SrcOpMap.end()) {
VT = Src.getValueType();
// Quit if not the same type.
if (SrcOpMap.begin() != SrcOpMap.end() &&
VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
// Quit if element already used.
unsigned CIdx = Idx->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
if (SrcMask) {
// Collect the source partial masks.
for (SDValue &SrcOp : SrcOps)
SrcMask->push_back(SrcOpMap[SrcOp]);
} else {
// Quit if not all elements are used.
for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
E = SrcOpMap.end();
I != E; ++I) {
if (!I->second.isAllOnesValue())
return false;
}
}
return true;
}
// Helper function for comparing all bits of a vector against zero.
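// With SSE4.1 this uses PTEST(V, V); otherwise it falls back to comparing
// MOVMSK(PCMPEQB(V, 0)) against 0xFFFF. Sub-128-bit inputs are bitcast to a
// legal scalar integer and compared with zero directly.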
static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
const APInt &Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, X86::CondCode &X86CC) {
EVT VT = V.getValueType();
assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
"Element Mask vs Vector bitwidth mismatch");
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
auto MaskBits = [&](SDValue Src) {
if (Mask.isAllOnesValue())
return Src;
EVT SrcVT = Src.getValueType();
SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
};
// For sub-128-bit vector, cast to (legal) integer and compare with zero.
if (VT.getSizeInBits() < 128) {
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
return SDValue();
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getBitcast(IntVT, MaskBits(V)),
DAG.getConstant(0, DL, IntVT));
}
// Quit if not splittable to 128/256-bit vector.
if (!isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
// Split down to 128/256-bit vector.
unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
while (VT.getSizeInBits() > TestSize) {
auto Split = DAG.SplitVector(V, DL);
VT = Split.first.getValueType();
V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
}
bool UsePTEST = Subtarget.hasSSE41();
if (UsePTEST) {
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
V = DAG.getBitcast(TestVT, MaskBits(V));
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
}
// Without PTEST, a masked v2i64 or-reduction is not faster than
// scalarization.
if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
return SDValue();
V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
DAG.getConstant(0xFFFF, DL, MVT::i32));
}
// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back
// to CMP(MOVMSK(PCMPEQB(X,0))).
static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
// Check whether we're masking/truncating an OR-reduction result, in which
// case track the masked bits.
APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
switch (Op.getOpcode()) {
case ISD::TRUNCATE: {
SDValue Src = Op.getOperand(0);
Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
Op.getScalarValueSizeInBits());
Op = Src;
break;
}
case ISD::AND: {
if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Mask = Cst->getAPIntValue();
Op = Op.getOperand(0);
}
break;
}
}
SmallVector<SDValue, 8> VecIns;
if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
EVT VT = VecIns[0].getValueType();
assert(llvm::all_of(VecIns,
[VT](SDValue V) { return VT == V.getValueType(); }) &&
"Reduction source vector mismatch");
// Quit if less than 128-bits or not splittable to 128/256-bit vector.
if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
// If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is
// only 1 node left, i.e. the final OR'd value of all vectors.
SDValue LHS = VecIns[Slot];
SDValue RHS = VecIns[Slot + 1];
VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
}
X86::CondCode CCode;
if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
DAG, CCode)) {
X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
return V;
}
}
if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ISD::NodeType BinOp;
if (SDValue Match =
DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
X86::CondCode CCode;
if (SDValue V =
LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
return V;
}
}
}
return SDValue();
}
/// return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
SDNode *User = *UI;
unsigned UOpNo = UI.getOperandNo();
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
// Look past the truncate.
UOpNo = User->use_begin().getOperandNo();
User = *User->use_begin();
}
if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
!(User->getOpcode() == ISD::SELECT && UOpNo == 0))
return true;
}
return false;
}
// Transform to an x86-specific ALU node with flags if there is a chance of
// using an RMW op or only the flags are used. Otherwise, leave
// the node alone and emit a 'cmp' or 'test' instruction.
static bool isProfitableToUseFlagOp(SDValue Op) {
for (SDNode *U : Op->uses())
if (U->getOpcode() != ISD::CopyToReg &&
U->getOpcode() != ISD::SETCC &&
U->getOpcode() != ISD::STORE)
return false;
return true;
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the Overflow flag.
// If NoSignedWrap is present, it is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
}
break;
}
}
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
unsigned Opcode = 0;
unsigned NumOperands = 0;
SDValue ArithOp = Op;
// NOTE: In the code below, ArithOp holds the arithmetic operation, which may
// be the result of a CAST. The variable 'Op' (the non-cast value) is used
// when checking for possible users.
switch (ArithOp.getOpcode()) {
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
if (!hasNonFlagsUse(Op))
break;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
if (!isProfitableToUseFlagOp(Op))
break;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
case ISD::SSUBO:
case ISD::USUBO: {
// USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
Op->getOperand(1)).getValue(1);
}
default:
break;
}
if (Opcode == 0) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
// Only promote the compare up to i32 if it is a 16-bit operation
// with an immediate; 16-bit immediates are to be avoided.
if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
!DAG.getMachineFunction().getFunction().hasMinSize()) {
ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
// Don't do this if the immediate can fit in 8-bits.
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncated from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op0.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op1.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
}
}
CmpVT = MVT::i32;
Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
// Try to shrink i64 compares if the input has enough zero bits.
// FIXME: Do this for non-constant compares if the constant is on the LHS?
if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
CmpVT = MVT::i32;
Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
return Add.getValue(1);
}
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
return Add.getValue(1);
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
return Sub.getValue(1);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// We never want to use both SQRT and RSQRT instructions for the same input.
if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
return Subtarget.hasFastVectorFSQRT();
return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
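/// One reciprocal-sqrt refinement step: Est' = Est * (1.5 - 0.5 * Op * Est * Est).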
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
// after legalize types.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
// There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
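/// One reciprocal refinement step: Est' = Est * (2.0 - Op * Est).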
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
return SDValue();
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
// There is no FRCP for 512-bit vectors, but there is RCP14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
SDValue
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
"Unexpected divisor!");
// Only perform this transform if CMOV is supported; otherwise the select
// below will become a branch.
if (!Subtarget.hasCMov())
return SDValue();
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
// FIXME: Support i8.
if (VT != MVT::i16 && VT != MVT::i32 &&
!(Subtarget.is64Bit() && VT == MVT::i64))
return SDValue();
unsigned Lg2 = Divisor.countTrailingZeros();
// If the divisor is 2 or -2, the default expansion is better.
if (Lg2 == 1)
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue Zero = DAG.getConstant(0, DL, VT);
APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
// If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
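// e.g. for X sdiv 8: T = (X s< 0) ? X + 7 : X; result = T s>> 3.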
SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
Created.push_back(Cmp.getNode());
Created.push_back(Add.getNode());
Created.push_back(CMov.getNode());
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
Created.push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
SDValue Src, BitNo;
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known = DAG.computeKnownBits(Op0);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
Src = Op1;
BitNo = Op0.getOperand(1);
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
Src = AndLHS.getOperand(0);
BitNo = AndLHS.getOperand(1);
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
Src.getValueType());
}
}
}
// No patterns found, give up.
if (!Src.getNode())
return SDValue();
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reasons.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes BitNo modulo 32 and the latter
// takes it modulo 64, this is only valid if bit 5 of BitNo (the bit with
// value 32) is known to be zero.
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
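// With AVX, immediates 8 (EQ_UQ) and 12 (NEQ_OQ) are also produced below for
// SETUEQ and SETONE; pre-AVX those two are emulated with a pair of compares.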
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETOGE:
case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: SSECC = 5; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ: SSECC = 8; break;
case ISD::SETONE: SSECC = 12; break;
}
if (Swap)
std::swap(Op0, Op1);
switch (SetCCOpcode) {
default:
IsAlwaysSignaling = true;
break;
case ISD::SETEQ:
case ISD::SETOEQ:
case ISD::SETUEQ:
case ISD::SETNE:
case ISD::SETONE:
case ISD::SETUNE:
case ISD::SETO:
case ISD::SETUO:
IsAlwaysSignaling = false;
break;
}
return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the results back.
static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
assert(Op.getOperand(0).getValueType().isInteger() &&
VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS Lo/Hi vectors
SDValue LHS1, LHS2;
std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
// Extract the RHS Lo/Hi vectors
SDValue RHS1, RHS2;
std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
std::swap(Op0, Op1);
}
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow or this is not a simple vector constant,
/// return an empty value.
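/// e.g. incrementing <i32 1, i32 2, i32 3, i32 4> yields <2, 3, 4, 5>; decrementing
/// a vector containing 0 (or incrementing one containing the unsigned maximum)
/// returns an empty SDValue.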
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> NewVecC;
SDLoc DL(V);
for (unsigned i = 0; i < NumElts; ++i) {
auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
}
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
ISD::CondCode Cond, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!Subtarget.hasSSE2())
return SDValue();
MVT VET = VT.getVectorElementType();
if (VET != MVT::i8 && VET != MVT::i16)
return SDValue();
switch (Cond) {
default:
return SDValue();
case ISD::SETULT: {
// If the comparison is against a constant we can turn this into a
// setule. With psubus, setule does not require a swap. This is
// beneficial because the constant in the register is no longer
// clobbered as the destination, so it can be hoisted out of a loop.
// Only do this pre-AVX, since AVX vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
case ISD::SETUGT: {
// If the comparison is against a constant, we can turn this into a setuge.
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
Op0 = UGEOp1;
break;
}
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
break;
case ISD::SETULE:
break;
}
SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
DAG.getConstant(0, dl, VT));
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// If we have a strict compare with a vXi1 result and the input is 128/256
// bits we can't use a masked compare unless we have VLX. If we use a wider
// compare like we do for non-strict, we might trigger spurious exceptions
// from the upper elements. Instead emit an AVX compare and convert to mask.
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
(!IsStrict || Subtarget.hasVLX() ||
Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
SDValue Cmp;
bool IsAlwaysSignaling;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
if (!Subtarget.hasAVX()) {
// TODO: We could use the following steps to handle a quiet compare with
// signaling encodings.
// 1. Get ordered masks from a quiet ISD::SETO
// 2. Use the masks to mask potential unordered elements in operands A and B
// 3. Get the compare results of the masked A and B
// 4. Calculate the final result using the mask and the result from 3
// But currently, we just fall back to scalar operations.
if (IsStrict && IsAlwaysSignaling && !IsSignaling)
return SDValue();
// Insert an extra signaling instruction to raise exception.
if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
SDValue SignalCmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
// FIXME: It seems we need to update the flags of all new strict nodes.
// Otherwise, mayRaiseFPException in MI will return false due to
// NoFPExcept = false by default. However, I didn't find it in other
// patches.
SignalCmp->setFlags(Op->getFlags());
Chain = SignalCmp.getValue(1);
}
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
if (SSECC >= 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = X86ISD::FAND;
}
SDValue Cmp0, Cmp1;
if (IsStrict) {
Cmp0 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
Cmp1 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
Cmp1.getValue(1));
} else {
Cmp0 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
Cmp1 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
}
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
if (IsStrict) {
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
} else {
// Handle all other FP comparisons here.
if (IsStrict) {
// Make a flip on already signaling CCs before setting bit 4 of AVX CC.
SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
// We emitted a compare with an XMM/YMM result. Finish converting to a
// mask register using a vptestm.
EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
Cmp = DAG.getBitcast(CastVT, Cmp);
Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
DAG.getConstant(0, dl, CastVT), ISD::SETNE);
} else {
// If this is SSE/AVX CMPP, bitcast the result back to integer to match
// the result type of SETCC. The bitcast is expected to be optimized
// away during combining/isel.
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
}
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
}
assert(!IsStrict && "Strict SETCC only handles FP operands.");
MVT VTOp0 = Op0.getSimpleValueType();
(void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In the AVX-512 architecture setcc returns a mask with i1 elements,
// but there is no compare instruction for i8 and i16 elements in KNL.
assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
"Unexpected operand type");
return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETULE:
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETUGT:
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETUGE:
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
}
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getTargetConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
SDValue BC0 = peekThroughBitcasts(Op0);
if (BC0.getOpcode() == ISD::AND) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(BC0.getOperand(1),
VT.getScalarSizeInBits(), UndefElts,
EltBits, false, false)) {
if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
Cond = ISD::SETEQ;
Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
}
}
}
}
// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
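// e.g. for v16i8 elements and C == 16: (X & 16) == 16 --> (X << 3) s>> 7, which
// is all-ones in lanes where bit 4 of X is set and zero elsewhere.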
if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
ConstantSDNode *C1 = isConstOrConstSplat(Op1);
if (C1 && C1->getAPIntValue().isPowerOf2()) {
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
SDValue Result = Op0.getOperand(0);
Result = DAG.getNode(ISD::SHL, dl, VT, Result,
DAG.getConstant(ShiftAmt, dl, VT));
Result = DAG.getNode(ISD::SRA, dl, VT, Result,
DAG.getConstant(BitWidth - 1, dl, VT));
return Result;
}
}
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8) {
assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
return splitIntVSETCC(Op, DAG);
}
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT,
// which will later be swapped to SETGT.
// Otherwise we use PCMPEQ+invert.
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
if (ConstValue.isMinSignedValue())
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
}
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for unsigned compares.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isUnsignedIntSetCC(Cond) &&
(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
if (Cond == ISD::SETUGT) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
Op1 = UGTOp1;
Cond = ISD::SETUGE;
}
}
if (Cond == ISD::SETULT) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
Op1 = ULTOp1;
Cond = ISD::SETULE;
}
}
bool Invert = false;
unsigned Opc;
switch (Cond) {
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ISD::UMAX; break;
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to use SUBUS and PCMPEQ.
if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
// the odd elements over the even elements.
if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
Op0 = DAG.getConstant(0, dl, MVT::v4i32);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
}
if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
} else {
SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
// Cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
// Must be a bitcast from vXi1.
if (Op0.getOpcode() != ISD::BITCAST)
return SDValue();
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
!(Subtarget.hasDQI() && VT == MVT::v8i1) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
X86::CondCode X86Cond;
if (isNullConstant(Op1)) {
X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
} else if (isAllOnesConstant(Op1)) {
// C flag is set for all ones.
X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
} else
return SDValue();
// If the input is an AND, we can combine its operands into the KTEST.
bool KTestable = false;
if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
KTestable = true;
if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
KTestable = true;
if (!isNullConstant(Op1))
KTestable = false;
if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
SDValue LHS = Op0.getOperand(0);
SDValue RHS = Op0.getOperand(1);
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
}
// If the input is an OR, we can combine its operands into the KORTEST.
SDValue LHS = Op0;
SDValue RHS = Op0;
if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
LHS = Op0.getOperand(0);
RHS = Op0.getOperand(1);
}
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG,
SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
return BT;
}
// Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
if (SDValue CmpZ =
MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
return CmpZ;
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
return Test;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
X86CC = Op0.getOperand(0);
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
}
return Op0.getOperand(1);
}
}
// Try to use the carry flag from the add in place of a separate CMP for:
// (seteq (add X, -1), -1). Similar for setne.
if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (isProfitableToUseFlagOp(Op0)) {
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
Op0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
return SDValue(New.getNode(), 1);
}
}
X86::CondCode CondCode =
TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
if (IsStrict)
return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
}
}
if (Op0.getSimpleValueType().isInteger()) {
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
// Handle floating point.
X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
SDValue EFLAGS;
if (IsStrict) {
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
EFLAGS =
DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
Chain = EFLAGS.getValue(1);
} else {
EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
}
SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
// flag and the condition code define the case in which the arithmetic
// computation overflows.
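// e.g. for ISD::SADDO the computation becomes an X86ISD::ADD whose second
// result is the EFLAGS value, with Cond set to X86::COND_O.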
static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
assert(Op.getResNo() == 0 && "Unexpected result number!");
SDValue Value, Overflow;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned BaseOp = 0;
SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
case ISD::USUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_B;
break;
case ISD::SMULO:
BaseOp = X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO:
BaseOp = X86ISD::UMUL;
Cond = X86::COND_O;
break;
}
if (BaseOp) {
// Also sets EFLAGS.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
// has only one use.
SDLoc DL(Op);
X86::CondCode Cond;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::FCMP)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
return false;
}
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
unsigned SSECC =
translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
!isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
}
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
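// For the first four forms: computing x - 1 sets the carry flag exactly when
// x == 0, so the SBB below materializes all-ones in that case and zero
// otherwise; OR-ing that mask with y then yields -1 or y as required, with a
// NOT inserted for the two inverted forms.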
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
SDValue CmpOp0 = Cmp.getOperand(0);
unsigned CondCode = Cond.getConstantOperandVal(0);
// Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
// (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
// handling below and keep the CMP with 0. The CMP should be removed later by
// optimizeCompareInstr, using the flags from the BSR/TZCNT emitted for the
// cttz_zero_undef.
auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
};
if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
(CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
// Keep Cmp.
} else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
Zero = DAG.getConstant(0, DL, Op.getValueType());
return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
}
Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue Src1, Src2;
// Returns true if Op2 is an XOR or OR operation and one of its operands
// equals Op1, i.e. the select operands have the form
// (a, a op b) or (b, a op b).
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
}
return false;
};
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// We need a mask of all zeros or all ones with the same size as the other
// operands.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
}
}
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
AddTest = false;
}
}
}
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
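// SETCC_CARRY materializes the carry flag as all-ones or all-zeros (an
// sbb reg,reg), so for the unsigned compare a - b the "a < b ? -1 : 0" form is
// exactly the materialized carry and the other forms differ only by a NOT.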
if (Cond.getOpcode() == X86ISD::SUB) {
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res =
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
}
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Exclude CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// Or finally, promote i8 cmovs if we have CMOV,
// or i16 cmovs if it won't prevent folding a load.
// FIXME: we should not limit promotion of the i8 case to only when CMOV is
// legal, but EmitLoweredSelect() cannot deal with these extensions being
// inserted between two CMOVs (this applies to the i16 case as well).
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, dl));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
}
SDValue V;
MVT WideEltVT = WideVT.getVectorElementType();
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
} else {
SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
SDValue Zero = DAG.getConstant(0, dl, WideVT);
V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
}
// Truncate if we had to extend i16/i8 above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(VTElt, NumElts);
V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
DAG.getIntPtrConstant(0, dl));
return V;
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
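// For reference, a [sz]ext_vector_inreg only uses the low input elements:
// e.g. (sign_extend_vector_inreg v16i8:x) to v4i32 sign-extends the low four
// bytes of x into the four i32 result lanes and ignores the rest.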
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasAVX()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (InVT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * NumElts;
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
InVT = In.getSimpleValueType();
}
// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
// results, so these are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
// instructions still need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
if (InVT.getVectorNumElements() != NumElts)
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// FIXME: Apparently we create inreg operations that could be regular
// extends.
unsigned ExtOpc =
Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, dl, VT, In);
}
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
MVT HalfVT = VT.getHalfNumVectorElementsVT();
int HalfNumElts = HalfVT.getVectorNumElements();
unsigned NumSrcElts = InVT.getVectorNumElements();
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
for (int i = 0; i != HalfNumElts; ++i)
HiMask[i] = HalfNumElts + i;
SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
// We should only get here for sign extend.
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
SDValue SignExt = Curr;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
if (InVT != MVT::v4i32) {
MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
unsigned DestWidth = DestVT.getScalarSizeInBits();
unsigned Scale = DestWidth / InSVT.getSizeInBits();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned DestElts = DestVT.getVectorNumElements();
// Build a shuffle mask that takes each input element and places it in the
// MSBs of the new element size.
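// E.g. for v16i8 -> v4i32 (Scale == 4) the mask is
// <u,u,u,0, u,u,u,1, u,u,u,2, u,u,u,3>, which places each source byte in the
// top byte of its i32 lane; the arithmetic shift below then completes the
// sign extension.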
SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
for (unsigned i = 0; i != DestElts; ++i)
Mask[i * Scale + (Scale - 1)] = i;
Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
Curr = DAG.getBitcast(DestVT, Curr);
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
SignExt = DAG.getBitcast(VT, SignExt);
}
return SignExt;
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(InVT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
// sign extend v8i16 to v8i32 and v4i32 to v4i64.
//
// Divide the input vector into two parts;
// for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
// Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
// then concat the vectors back to the original VT.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
SmallVector<int,8> ShufMask(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
/// Change a vector store into a pair of half-size vector stores.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert((StoredVal.getValueType().is256BitVector() ||
StoredVal.getValueType().is512BitVector()) &&
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. Assume the input store is legal (this transform is
// only used for targets with AVX). Note: It is possible that we have an
// illegal type like v2i128, and so we could allow splitting a volatile store
// in that case if that is important.
if (!Store->isSimple())
return SDValue();
SDLoc DL(Store);
SDValue Value0, Value1;
std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
Store->getPointerInfo().getWithOffset(HalfOffset),
Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
/// type.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert(StoreVT.is128BitVector() &&
StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
StoredVal = DAG.getBitcast(StoreVT, StoredVal);
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
if (!Store->isSimple())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
SDValue StoredVal = St->getValue();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
"Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
if (St->isTruncatingStore())
return SDValue();
// If this is a 256-bit store of concatenated ops, we are better off splitting
// that store into two 128-bit stores. This avoids spurious use of 256-bit ops
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector() ||
((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
!Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return SDValue();
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
TargetLowering::TypeWidenVector && "Unexpected type action!");
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
if (Subtarget.hasSSE2()) {
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
DAG.getBitcast(MVT::v16i1, Val),
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
return SDValue();
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
return false;
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse() &&
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
Op.getOperand(1).hasOneUse());
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
if (Cond.getOpcode() == ISD::SETCC &&
Cond.getOperand(0).getValueType() != MVT::f128) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Special case for
// setcc([su]{add,sub,mul}o == 0)
// setcc([su]{add,sub,mul}o != 1)
if (ISD::isOverflowIntrOpRes(LHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE) &&
(isNullConstant(RHS) || isOneConstant(RHS))) {
SDValue Value, Overflow;
X86::CondCode X86Cond;
std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
if ((CC == ISD::SETEQ) == isNullConstant(RHS))
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
if (LHS.getSimpleValueType().isInteger()) {
SDValue CCVal;
SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
EFLAGS);
}
if (CC == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
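// The FP compare sets ZF on equality and PF on an unordered result, so
// FCMP_OEQ is "ZF set and PF clear"; with the successors reversed we branch
// to the false block on COND_NE or COND_P and fall through otherwise.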
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
SDValue Cmp =
DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
CCVal, Cmp);
CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
}
} else if (CC == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
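// FCMP_UNE is true when the operands compare not-equal or unordered, i.e.
// COND_NE || COND_P, so two branches to the same destination cover it.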
SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain =
DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
} else {
X86::CondCode X86Cond =
TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
}
if (ISD::isOverflowIntrOpRes(Cond)) {
SDValue Value, Overflow;
X86::CondCode X86Cond;
std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
EVT CondVT = Cond.getValueType();
// Add an AND with 1 if we don't already have one.
if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
Cond =
DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
SDValue LHS = Cond;
SDValue RHS = DAG.getConstant(0, dl, CondVT);
SDValue CCVal;
SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
EFLAGS);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbeCall = hasStackProbeSymbol(MF);
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbeCall;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Align StackAlign = TFI.getStackAlign();
if (hasInlineStackProbe(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
}
if (Alignment && *Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64-bit implementation of segmented stacks needs to clobber both r10
// and r11. This makes it impossible to use it along with nested parameters.
const Function &F = MF.getFunction();
for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
}
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
Result = SP;
}
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (points to parameters passed in memory).
// reg_save_area
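// The SysV AMD64 ABI defines the tag as:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };
// The pointer fields shrink to 4 bytes under the X32 ILP32 ABI, which is why
// the field offsets below depend on isTarget64BitLP64().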
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else {
assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
"Unhandled argument type in LowerVAARG");
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
}
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
/*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction().getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
Align(8), /*isVolatile*/ false, false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
switch (Opc) {
case ISD::SHL:
case X86ISD::VSHL:
case X86ISD::VSHLI:
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
case ISD::SRL:
case X86ISD::VSRL:
case X86ISD::VSRLI:
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
case ISD::SRA:
case X86ISD::VSRA:
case X86ISD::VSRAI:
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
}
llvm_unreachable("Unknown target vector shift node");
}
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
// Bitcast the source vector to the output type; this is mainly necessary for
// vXi8/vXi64 shifts.
if (VT != SrcOp.getSimpleValueType())
SrcOp = DAG.getBitcast(VT, SrcOp);
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, dl, VT);
}
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// All shifted in bits must be the same so use 0.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
break;
}
return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
Opc = getTargetVShiftUniformOpcode(Opc, true);
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64 bits of the shift count.
// +====================+============+=======================================+
// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
// +====================+============+=======================================+
// | i64 | Yes, No | Use ShAmt as lowest elt |
// | i32 | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
// +====================+============+=======================================+
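// The byte-shift-in-reg case emulates the in-register zero-extend without
// SSE4.1: shifting the scalar element to the top of the 128-bit register
// (VSHLDQ) and back down (VSRLDQ) clears every byte above it.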
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
ShAmt = ShAmt.getOperand(0);
MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
if (Subtarget.hasSSE41())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
SDValue ByteShift = DAG.getTargetConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
}
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
} else {
SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getConstant(0, dl, MaskVT);
assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode a bitcast of i64 is illegal, so split the mask into two
// i32 halves instead.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(0, dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(1, dl, MVT::i32));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
}
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
SDLoc dl(Op);
if (isAllOnesConstant(Mask))
return Op;
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
if (MaskConst->getZExtValue() & 0x1)
return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
report_fatal_error(
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
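/// For example, with an MSVC x86 SEH personality the registration node is 24
/// bytes (see getSEHRegistrationNodeSize), so with a hypothetical
/// ParentFrameOffset of -64: ParentFP = (EntryEBP - 24) - (-64) = EntryEBP + 40.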
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDValue EntryEBP) {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
// the incoming EBP.
if (!Fn->hasPersonalityFn())
return EntryEBP;
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
unsigned RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
// As a convenience, we allow either no other bits set or, explicitly,
// the current-direction rounding mode.
return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
}
}
return false;
};
auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
RC == X86::STATIC_ROUNDING::TO_POS_INF ||
RC == X86::STATIC_ROUNDING::TO_ZERO;
}
}
return false;
};
SDLoc dl(Op);
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1),
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1), Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
case INTR_TYPE_2OP_SAE: {
SDValue Sae = Op.getOperand(3);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src1, Src2, Src3,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
{Src1, Src2, Src3});
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RC Opcode is specified and
// - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getVectorMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, PassThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getVectorMaskingNode(
DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Rnd = Op.getOperand(4);
unsigned Opc;
if (isRoundModeCurDirection(Rnd))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Rnd))
Opc = IntrData->Opc1;
else
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
// (2) With rounding mode and sae - 7 operands.
bool HasRounding = IntrWithRoundingModeOpcode != 0;
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getScalarMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
Mask, passThru, Subtarget, DAG);
}
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrWithRoundingModeOpcode;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Rnd = Op.getOperand(5);
SDValue NewOp;
unsigned RC = 0;
if (isRoundModeCurDirection(Rnd))
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
else if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else
return SDValue();
return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Sae = Op.getOperand(5);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue NewOp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
if (!NewOp)
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case BLENDV: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
Src3 = DAG.getBitcast(MaskVT, Src3);
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
}
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(4);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
{Op.getOperand(1), Op.getOperand(2), CC});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
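// (U)COMIS{S,D} set ZF/PF/CF like an unsigned compare, with all three set for
// an unordered result. So e.g. a > b maps to COND_A (CF = 0 and ZF = 0), and
// equality additionally checks PF = 0 so unordered inputs don't compare equal.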
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// Some conditions require the operands to be swapped.
if (CC == ISD::SETLT || CC == ISD::SETLE)
std::swap(LHS, RHS);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = Op.getConstantOperandVal(3);
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getConstant(0, dl, MVT::v16i1),
FCmp, DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
// Avoid false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, VT);
return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
Mask);
}
case FIXUPIMM:
case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM)
? Src1
: getZeroVector(VT, Subtarget, DAG, dl);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(2));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(3));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
case BEXTRI: {
assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
// The control is a TargetConstant, but we need to convert it to a
// ConstantSDNode.
uint64_t Imm = Op.getConstantOperandVal(2);
SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
}
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
SDValue Res;
// If the carry in is zero, then we should just use ADD/SUB instead of
// ADC/SBB.
if (isNullConstant(Op.getOperand(1))) {
Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
Op.getOperand(3));
} else {
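// Re-materialize CF from the i8 carry-in: adding 0xFF (-1) to any non-zero
// carry value produces a carry-out, which the ADC/SBB below consumes via
// GenCF.getValue(1).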
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
DAG.getConstant(-1, dl, MVT::i8));
Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
Op.getOperand(3), GenCF.getValue(1));
}
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Res };
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
case CVTPD2DQ_MASK:
case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
{Src, PassThru, Mask});
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
SDValue Rnd = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
// Break false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
default:
break;
}
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
// CF = 1
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
return DAG.getNode(getGlobalWrapperKind(), dl, VT,
DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
}
case Intrinsic::eh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
bool CantUseFP = RegInfo->needsStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
else
Reg = RegInfo->getPtrSizedFrameRegister(MF);
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
case Intrinsic::x86_avx512_vp2intersect_d_512:
case Intrinsic::x86_avx512_vp2intersect_d_256:
case Intrinsic::x86_avx512_vp2intersect_d_128: {
MVT MaskVT = Op.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
SDLoc DL(Op);
SDValue Operation =
DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
Op->getOperand(1), Op->getOperand(2));
SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
MaskVT, Operation);
SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
case Intrinsic::x86_mmx_psrli_w:
case Intrinsic::x86_mmx_psrli_d:
case Intrinsic::x86_mmx_psrli_q:
case Intrinsic::x86_mmx_psrai_w:
case Intrinsic::x86_mmx_psrai_d: {
SDLoc DL(Op);
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
// Clamp out-of-bounds shift amounts since they will otherwise be masked
// to 8 bits, which may make them no longer out of bounds.
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
if (ShiftAmount == 0)
return Op.getOperand(1);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
}
unsigned NewIntrinsic;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntrinsic = Intrinsic::x86_mmx_psll_w;
break;
case Intrinsic::x86_mmx_pslli_d:
NewIntrinsic = Intrinsic::x86_mmx_psll_d;
break;
case Intrinsic::x86_mmx_pslli_q:
NewIntrinsic = Intrinsic::x86_mmx_psll_q;
break;
case Intrinsic::x86_mmx_psrli_w:
NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
break;
case Intrinsic::x86_mmx_psrli_d:
NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
break;
case Intrinsic::x86_mmx_psrli_q:
NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
break;
case Intrinsic::x86_mmx_psrai_w:
NewIntrinsic = Intrinsic::x86_mmx_psra_w;
break;
case Intrinsic::x86_mmx_psrai_d:
NewIntrinsic = Intrinsic::x86_mmx_psra_d;
break;
}
// The vector shift intrinsics with scalars use 32-bit shift amounts, but
// the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
// MMX register.
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(NewIntrinsic, DL, MVT::i32),
Op.getOperand(1), ShAmt);
}
}
}
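// Lower an AVX2 gather intrinsic to an X86ISD::MGATHER memory intrinsic node.
// The scale operand must be a constant; otherwise a null SDValue is returned.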
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
// Cast mask to an integer type.
Mask = DAG.getBitcast(MaskVT, Mask);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
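// Lower an AVX-512 gather intrinsic. Unlike the AVX2 form above, the mask may
// arrive either as a vXi1 vector or as a scalar integer; getMaskNode converts
// the scalar form to vXi1 before the X86ISD::MGATHER node is built.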
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
VT.getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the gather intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
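// Lower an AVX-512 scatter intrinsic to an X86ISD::MSCATTER memory node. As
// with the gathers, the scale must be a constant and a scalar mask is widened
// to vXi1 first.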
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the scatter intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return Res;
}
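// Lower a gather/scatter prefetch intrinsic directly to a machine node. The
// operands are the mask followed by the usual X86 memory-operand tuple
// (base, scale, index, displacement, segment) and the chain.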
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if the
/// expanded intrinsic implicitly defines extra registers (i.e. not just
/// EDX:EAX).
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
unsigned TargetOpcode,
unsigned SrcReg,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Glue;
if (SrcReg) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
Glue = Chain.getValue(1);
}
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue N1Ops[] = {Chain, Glue};
SDNode *N1 = DAG.getMachineNode(
TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// The expanded instruction returns its 64-bit result in registers EDX:EAX.
SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
Chain = HI.getValue(1);
Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
return Glue;
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
/* NoRegister */0, Subtarget,
Results);
if (Opcode != X86::RDTSCP)
return;
SDValue Chain = Results[1];
// Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
// the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
Results[1] = ecx;
Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue RegNode = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EH registrations only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue EHGuard = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EHGuard only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = Op.getConstantOperandVal(1);
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_rdpkru: {
SDLoc dl(Op);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
// Create a RDPKRU node and pass 0 to the ECX parameter.
return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_wrpkru: {
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
Op.getOperand(0), Op.getOperand(2),
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
return Op;
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
case Intrinsic::x86_umwait:
case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
break;
case Intrinsic::x86_tpause:
Opcode = X86ISD::TPAUSE;
break;
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
Opcode = X86ISD::LWPINS;
break;
}
SDValue Operation =
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
case Intrinsic::x86_enqcmd:
case Intrinsic::x86_enqcmds: {
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
break;
case Intrinsic::x86_enqcmds:
Opcode = X86ISD::ENQCMDS;
break;
}
SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
Op.getOperand(3));
SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
return SDValue();
}
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, cast to i32.
SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1)};
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
// gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
// scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case PREFETCH: {
const APInt &HintVal = Op.getConstantOperandAPInt(6);
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Offset = DAG.getUNDEF(VMask.getValueType());
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
}
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI.setFrameAddressIsTaken(true);
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
int FrameAddrIndex = FuncInfo->getFAIndex();
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
}
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = Op.getConstantOperandVal(0);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
Register Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
.Default(0);
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
#endif
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
Register X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
bool X86TargetLowering::needsFixedCatchObjects() const {
return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
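// Compute the address of the return-address slot (FrameReg + SlotSize) plus
// the Offset operand, store the new Handler there, and hand that address to
// EH_RETURN in RCX/ECX.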
Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64-bit, we may need the global base register
// after the isel pseudo expansion, i.e., after the CGBR pass has run.
// Therefore, ask for the GlobalBaseReg now so that the pass inserts the
// code for us in case we need it. Otherwise, we would end up referencing
// a virtual register that is never defined!
if (!Subtarget.is64Bit()) {
const X86InstrInfo *TII = Subtarget.getInstrInfo();
(void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
}
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
}
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
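// The stores below lay out a 23-byte trampoline equivalent to:
//   49 BB <imm64>   movabsq $fptr, %r11
//   49 BA <imm64>   movabsq $nest, %r10
//   49 FF E3        jmpq    *%r11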
// Load the pointer to the nested function into R11.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
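// The 32-bit trampoline is 10 bytes:
//   B8+r <imm32>    movl $nest, %eax/%ecx (register per calling convention)
//   E9 <rel32>      jmp  fptr             (rel32 = fptr - (trmp + 10))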
SDValue OutChains[4];
SDValue Addr, Disp;
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
/*
The rounding mode is in bits 11:10 of FPSR, and has the following
settings:
00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we use a packed lookup table of the four 2-bit
values that we can index by FPSR[11:10]
0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
(0x2d >> ((FPSR & 0xc00) >> 9)) & 3
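For example, with FPSR[11:10] = 01 (round toward -inf):
(0x2d >> ((0x0400 & 0xc00) >> 9)) & 3 = (0x2d >> 2) & 3 = 0xb & 3 = 3,
which is the FLT_ROUNDS encoding for "round to -inf".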
*/
MachineFunction &MF = DAG.getMachineFunction();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
SDValue Chain = Op.getOperand(0);
SDValue Ops[] = {Chain, StackSlot};
Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
Align(2), MachineMemOperand::MOStore);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
Chain = CWD.getValue(1);
// Mask and turn the control bits into a shift for the lookup table.
SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
DAG.getConstant(3, DL, MVT::i32));
RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
return DAG.getMergeValues({RetVal, Chain}, DL);
}
/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
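// For example, a v16i8 element 0x1a is zero-extended to the i32 0x0000001a,
// whose LZCNT is 27; subtracting the 24 widening bits gives 3, the correct
// 8-bit leading-zero count of 0x1a.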
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return splitVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
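// For example, for the byte 0x1a the hi nibble is 0x1 and the lo nibble 0xa:
// LUT[0x1] = 3, and since the hi nibble is non-zero the lo result is masked
// to zero, giving a final count of 3 == CTLZ of 0x1a over 8 bits.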
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = Op0;
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
SDValue HiZ;
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
}
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge the result back from vXi8 to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
}
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}
return Res;
}
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasCDI() &&
// vXi8 vectors need to be promoted to 512-bits for vXi32.
(Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is no i8 bsr instruction.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
// Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
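// Illustrative example of the BSR/XOR trick above (for exposition only): for a
// non-zero i32 x = 0x10, BSR returns the index of the highest set bit (4), and
// 31 ^ 4 = 27 = ctlz_i32(0x10). For x == 0 the CMOV substitutes
// 2*NumBits - 1 = 63, and 63 ^ 31 = 32, i.e. CTLZ of zero returns the bit
// width as required.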
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32)
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
if (VT.is128BitVector()) {
// Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), VT);
SDLoc DL(Op);
if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
}
if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
// usubsat X, Y --> (X >u Y) ? X - Y : 0
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
}
// Use default expansion.
return SDValue();
}
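// Illustrative i8 examples of the expansions above (for exposition only):
// uaddsat 0xF0, 0x20: the wrapped add is 0x10 and 0xF0 >u 0x10, so the select
// yields all-ones 0xFF (the saturated result). usubsat 0x10, 0x20:
// 0x10 >u 0x20 is false, so the select yields 0 rather than the wrapped
// subtraction.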
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
// Since X86 does not have CMOV for 8-bit integers, we don't convert
// 8-bit integer abs to NEG and CMOV.
SDLoc DL(Op);
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
SDValue Sub =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntUnary(Op, DAG);
}
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// Default to expand.
return SDValue();
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
// using the SMIN/SMAX instructions and flipping the signbit back.
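// Illustrative i16 example of the signbit flip (for exposition only): for
// umin(0x0001, 0xFFFF), flipping the sign bits gives 0x8001 (-32767) and
// 0x7FFF (32767); smin picks 0x8001, and flipping the sign bit back yields
// 0x0001, the correct unsigned minimum.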
if (VT == MVT::v8i16) {
assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
"Unexpected MIN/MAX opcode");
SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
}
// Else, expand to a compare/select.
ISD::CondCode CC;
switch (Opcode) {
case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
default: llvm_unreachable("Unknown MINMAX opcode");
}
SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
return DAG.getSelect(DL, VT, Cond, N0, N1);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
}
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Extract the lo/hi parts and any-extend them to i16.
// We're going to mask each pmullw result element down to its low byte, so it
// doesn't matter what ends up in the high byte of each 16-bit element.
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
MVT::i16));
HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
MVT::i16));
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
}
// Multiply, mask to the lower 8 bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, A),
DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Aodds),
DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
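// The sequence above is just the schoolbook decomposition
//   lo64(a * b) = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
// (the Ahi*Bhi term only affects bits >= 64 and is dropped). Illustrative
// check: a = 0x0000000100000002, b = 0x0000000300000004 gives AloBlo = 8,
// AloBhi = 6, AhiBlo = 4, so the result is 8 + (10 << 32) =
// 0x0000000A00000008, matching a*b mod 2^64.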
KnownBits AKnown = DAG.computeKnownBits(A);
KnownBits BKnown = DAG.computeKnownBits(B);
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = DAG.getConstant(0, dl, VT);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned NumElts = VT.getVectorNumElements();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
unsigned Opcode =
(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != (int)NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
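// Illustrative expansion of the formula above (for exposition only): with
// NumElts == 4 the mask is {1, 5, 3, 7}, i.e. the odd (high-32) halves of
// Mul1 interleaved with the odd halves of Mul2, producing
// <hi(ae), hi(bf), hi(cg), hi(dh)> back in the original element order.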
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
// If we have a signed multiply but no PMULDQ fix up the result of an
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
}
return Res;
}
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
// Extract the lo parts and zero/sign extend to i16.
// Only use SSE4.1 instructions for signed v16i8 where using unpack requires
// shifts to sign extend. Using unpack for unsigned only requires an xor to
// create zeros and a copy due to tied-register constraints pre-AVX. But using
// zero_extend_vector_inreg would require an additional pshufd for the high
// part.
SDValue ALo, AHi;
if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
} else if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
}
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);
if (IsSigned) {
LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
}
LoOps.push_back(LoOp);
HiOps.push_back(HiOp);
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
} else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
}
// Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi
// results and pack back to vXi8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
RTLIB::Libcall LC;
bool isSigned;
switch (Op->getOpcode()) {
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}
SDLoc dl(Op);
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
MPI, /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(
getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args))
.setInRegister()
.setSExtResult(isSigned)
.setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256());
bool AShift = LShift && (Subtarget.hasAVX512() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
}
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SRL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
}
return DAG.getBitcast(VT, Ex);
};
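// Illustrative example of the ShiftAmt >= 32 path above (for exposition
// only): ashr_i64(0xFF00000000000000, 40) is computed from the upper i32
// 0xFF000000 as sra-by-8 = 0xFFFF0000 for the low half and sra-by-31 =
// 0xFFFFFFFF for the sign-splatted high half, i.e. 0xFFFFFFFFFFFF0000,
// which equals -2^56 >> 40 = -2^16.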
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
if (!X86::isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = DAG.getConstant(0, dl, VT);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
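// Illustrative i8 check of this identity (for exposition only): with
// ShiftAmt = 4, Mask = 128 >> 4 = 0x08. For R = 0xF0 (-16): lshr gives 0x0F,
// xor 0x08 gives 0x07, sub 0x08 gives 0xFF (-1) = ashr(-16, 4). For
// R = 0x70 (112): 0x07 ^ 0x08 = 0x0F, and 0x0F - 0x08 = 0x07 = 112 >> 4.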
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
llvm_unreachable("Unknown shift opcode.");
}
return SDValue();
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
VT == MVT::v64i8) &&
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
BaseShAmt, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
BitMask = DAG.getBitcast(VT, BitMask);
BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
SmallVector<int, 64>(NumElts, 0));
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
if (Opcode == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
BaseShAmt, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
}
return Res;
}
}
}
// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
// Convert a shift/rotate left amount to a multiplication scale factor.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
APInt C(SVTBits, ND->getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
return DAG.getBuildVector(VT, dl, Elts);
}
// If the target doesn't support variable shifts, use either FP conversion
// or integer multiplication to avoid shifting each element individually.
if (VT == MVT::v4i32) {
Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
DAG.getConstant(0x3f800000U, dl, VT));
Amt = DAG.getBitcast(MVT::v4f32, Amt);
return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
}
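// The v4i32 path above builds 2^Amt by constructing an IEEE-754 single:
// placing Amt in the exponent field (<< 23) and adding the bias pattern
// 0x3f800000 yields the float 2^Amt, which FP_TO_SINT converts back to an
// integer power of two. Illustrative check: Amt = 3 gives 0x01800000 +
// 0x3f800000 = 0x41000000 = 8.0f, i.e. 1 << 3.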
// AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
DAG.getBitcast(VT, Hi),
{0, 2, 4, 6, 8, 10, 12, 14});
}
return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Opc))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Opc == ISD::SRL || Opc == ISD::SRA) {
SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
}
// v2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Opc != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Opc == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);
if (A.isUndef()) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
if (!Amt1 || Amt1 == A) {
ShuffleMask.push_back(i);
Amt1 = A;
continue;
}
if (!Amt2 || Amt2 == A) {
ShuffleMask.push_back(i + NumElts);
Amt2 = A;
continue;
}
break;
}
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst1->getZExtValue(), DAG);
SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst2->getZExtValue(), DAG);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Opc == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
if (Opc == ISD::SRL && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
return DAG.getSelect(dl, VT, ZAmt, R, Res);
}
}
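// Illustrative v8i16 check of the MULHU replacement above (for exposition
// only): lshr(x, Amt) == mulhu(x, 1 << (16 - Amt)) for 1 <= Amt <= 15, e.g.
// x = 256, Amt = 3: mulhu(256, 8192) = (256 * 8192) >> 16 = 32 = 256 >> 3.
// The Amt == 0 lanes are handled by the select since 1 << 16 does not fit in
// an i16 element.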
// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
// TODO: Special case handling for shift by 0/1, really we can afford either
// of these cases in pre-SSE41/XOP/AVX512 but not both.
if (Opc == ISD::SRA && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
!Subtarget.hasAVX512()) ||
DAG.isKnownNeverZero(Amt))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Amt0 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
SDValue Amt1 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
SDValue Sra1 =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
Res = DAG.getSelect(dl, VT, Amt0, R, Res);
return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
}
}
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
if (VT == MVT::v4i32) {
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. On AVX we're better off
// just zero-extending, but for SSE just duplicating the top 16 bits is
// cheaper and has the same effect for out of range values.
if (Subtarget.hasAVX()) {
SDValue Z = DAG.getConstant(0, dl, VT);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
} else {
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{2, 3, 3, 3, -1, -1, -1, -1});
Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{2, 3, 3, 3, -1, -1, -1, -1});
}
}
unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
// Merge the shifted lane results optimally with/without PBLENDW.
// TODO - ideally shuffle combining would handle this.
if (Subtarget.hasSSE41()) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
// NOTE: We honor the preferred vector width before promoting to 512 bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected");
if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
: DAG.getZExtOrTrunc(R, dl, ExVT);
R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
return DAG.getZExtOrTrunc(R, dl, VT);
}
SmallVector<SDValue, 16> LoAmt, HiAmt;
for (int i = 0; i != NumElts; i += 16) {
for (int j = 0; j != 8; ++j) {
LoAmt.push_back(Amt.getOperand(i + j));
HiAmt.push_back(Amt.getOperand(i + j + 8));
}
}
MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just
// on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT,
DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, dl, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
Amt = DAG.getBitcast(VT, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
}
if (Opc == ISD::SRA) {
// For SRA we need to unpack each byte into the high byte of an i16 element
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte meaning that we can safely pack with PACKUSWB.
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
if (VT == MVT::v8i16) {
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just on
// the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
} else {
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
}
// r = VSELECT(r, shift(r, 8), a);
SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
R = SignBitSelect(Amt, M, R);
return R;
}
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Custom lowering only for vector rotates!");
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
APInt CstSplatValue;
bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
// Check for splat rotate by zero.
if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
return R;
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(RotOpc, DL, VT, R,
DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
return Op;
}
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (IsCstSplat) {
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
return Op;
}
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
if (IsCstSplat)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
if (EltSizeInBits == 8 && !IsSplatAmt) {
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// We don't need ModuloAmt here as we just peek at individual bits.
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just
// on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT,
DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, DL, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
// ISD::ROT* uses modulo rotate amounts.
Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fall back to the shift-based expansion for splat amounts and for targets
// with legal variable shifts, and for non-constant AVX2 vXi16 amounts as well.
if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
// As with shifts, convert the rotation amount to a multiplication factor.
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
assert(Scale && "Failed to convert ROTL amount to scale");
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
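// For a rotate amount n, Scale holds (1 << n) per element, so MUL yields the
// low 16 bits of the product (x << n) and MULHU yields the high 16 bits
// (x >> (16 - n)); OR-ing the two reassembles the rotated value.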
if (EltSizeInBits == 16) {
SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
// that can then be OR'd with the lower 32-bits.
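// For a rotate amount n, the 64-bit product x * (1 << n) holds (x << n) in
// bits 31:0 and the wrapped-around bits (x >> (32 - n)) in bits 63:32.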
assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
static const int OddMask[] = {1, -1, 3, -1};
SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R),
DAG.getBitcast(MVT::v2i64, Scale));
SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R13),
DAG.getBitcast(MVT::v2i64, Scale13));
Res02 = DAG.getBitcast(VT, Res02);
Res13 = DAG.getBitcast(VT, Res13);
return DAG.getNode(ISD::OR, DL, VT,
DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
return false;
}
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return false;
return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();
// If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
bool NoImplicitFloatOps =
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
default:
llvm_unreachable("Unknown atomic operation");
case AtomicRMWInst::Xchg:
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
}
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces a mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
// TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
return nullptr;
IRBuilder<> Builder(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
// x.store(1, relaxed);
// r1 = y.fetch_add(0, release);
// Thread 1:
// y.fetch_add(42, acquire);
// r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. A mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: The fence is required if isReleaseOrStronger(Order), but it is not
// clearly needed otherwise; we might be able to be more aggressive on relaxed
// idempotent rmw. In practice, such cases do not look useful, so we don't try
// to be especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
Align(AI->getType()->getPrimitiveSizeInBits()));
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
if (!SI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
}
bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
if (!LI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
}
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
// See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
// 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
// line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
// to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
//
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
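// The node built below is effectively "lock orl $0, disp(%esp/%rsp)": a store
// that leaves memory unchanged but still provides LOCK's full-barrier
// semantics.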
auto &MF = DAG.getMachineFunction();
auto &TFL = *Subtarget.getFrameLowering();
const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain
};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering =
static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
SyncScope::ID FenceSSID =
static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
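// For v64i8, and for v32i8 without AVX2, the input is split in half, each
// half's mask is computed separately, and the high half's bits are shifted
// into position before being OR'd with the low half.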
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
if (InVT == MVT::v64i8) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
DAG.getConstant(32, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
}
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
DAG.getConstant(16, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
}
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
// Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
// half to v32i1 and concatenating the result.
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
assert(Subtarget.hasBWI() && "Expected BWI target");
SDLoc dl(Op);
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl));
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
SDLoc DL(Op);
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) && "Unexpected VT!");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
!(DstVT == MVT::x86mmx && SrcVT.isVector()))
// This conversion needs to be expanded.
return SDValue();
SDLoc dl(Op);
if (SrcVT.isVector()) {
// Widen the input vector in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
DAG.getUNDEF(SrcVT));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
}
MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
if (DstVT == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
DAG.getIntPtrConstant(0, dl));
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
"Horizontal byte sum only makes sense for wider elements!");
unsigned VecSize = VT.getSizeInBits();
assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// The PSADBW instruction horizontally adds all bytes and leaves the results in
// i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
if (EltVT == MVT::i64) {
SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}
if (EltVT == MVT::i32) {
// We unpack the low half and high half into i32s interleaved with zeros so
// that we can use PSADBW to horizontally sum them. The most useful part of
// this is that it lines up the results of two PSADBW instructions to be
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = DAG.getConstant(0, DL, VT);
SDValue V32 = DAG.getBitcast(VT, V);
SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
DAG.getBitcast(ShortVecVT, Low),
DAG.getBitcast(ShortVecVT, High));
return DAG.getBitcast(VT, V);
}
// The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
// To obtain pop count for each i16 element starting from the pop count for
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
// right by 8. It is important to shift as i16s because i8 vector shifts aren't
// directly supported.
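// e.g. if an i16 lane holds the byte counts [a, b] (a in the high byte), the
// shift left moves b into the high byte, the i8 add produces [a+b, b], and the
// final i16 shift right leaves a+b in the lane.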
SDValue ShifterV = DAG.getConstant(8, DL, VT);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
(void)EltVT;
assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
// Implement a lookup table in register by using an algorithm based on:
// http://wm.ite.pl/articles/sse-popcount.html
//
// The general idea is that each nibble of every input byte is an index into an
// in-register pre-computed pop count table. We split the input vector into two
// new ones: (1) a vector with only the shifted-right higher nibbles of each
// byte and (2) a vector with only the lower nibbles (higher ones masked out)
// of each byte. PSHUFB is used separately on both to index the in-register
// table. Finally, the two are added, yielding an i8 vector where each element
// contains the pop count of the corresponding input byte.
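// e.g. for the input byte 0xB7 the high nibble 0xB selects LUT[11] = 3 and the
// low nibble 0x7 selects LUT[7] = 3, so the final ADD yields popcount(0xB7) = 6.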
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
SDValue M0F = DAG.getConstant(0x0F, DL, VT);
// High nibbles
SDValue FourV = DAG.getConstant(4, DL, VT);
SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
// Low nibbles
SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
// The nibble vectors are used as shuffle masks that index elements into the
// LUT. After counting the low and high nibbles, add the two vectors to obtain
// the final pop count per i8 element.
SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
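// e.g. a v16i8 CTPOP becomes TRUNCATE(v16i8, CTPOP(ZERO_EXTEND(v16i32, X))),
// letting VPOPCNTD do the counting per 32-bit lane.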
if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
}
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
}
// We can't use the fast LUT approach, so fall back on LegalizeDAG.
if (!Subtarget.hasSSSE3())
return SDValue();
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// For scalars, it's still beneficial to transfer to/from the SIMD unit to
// perform the BITREVERSE.
if (!VT.isVector()) {
MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
return splitVectorIntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
// VPPERM reverses the bits of a byte with the permute op (2 << 5), and we
// also perform the per-element BSWAP within the same shuffle.
// It's best to shuffle using the second operand, as this implicitly allows
// memory folding for multiple vectors.
SmallVector<SDValue, 16> MaskElts;
for (int i = 0; i != NumElts; ++i) {
for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
int PermuteByte = SourceByte | (2 << 5);
MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
}
}
SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
SDValue Res = DAG.getBitcast(MVT::v16i8, In);
Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
Res, Mask);
return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// Split v64i8 without BWI so that we can still use the PSHUFB lowering.
if (VT == MVT::v64i8 && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
// 0-15 value (moved to the other nibble).
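// e.g. for the byte 0x1E, LoLUT[0xE] = 0x70 and HiLUT[0x1] = 0x08; OR-ing them
// gives 0x78, which is 0x1E with its bits reversed.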
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
const int LoLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
/* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
/* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
/* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
const int HiLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
/* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
/* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
/* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
for (unsigned i = 0; i < NumElts; ++i) {
LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
}
SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
switch (N->getOpcode()) {
case ISD::ATOMIC_LOAD_ADD:
NewOpc = X86ISD::LADD;
break;
case ISD::ATOMIC_LOAD_SUB:
NewOpc = X86ISD::LSUB;
break;
case ISD::ATOMIC_LOAD_OR:
NewOpc = X86ISD::LOR;
break;
case ISD::ATOMIC_LOAD_XOR:
NewOpc = X86ISD::LXOR;
break;
case ISD::ATOMIC_LOAD_AND:
NewOpc = X86ISD::LAND;
break;
default:
llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
}
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
/*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
unsigned Opc = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// We can lower atomic_load_add into LXADD. However, any other atomicrmw op
// can only be lowered when the result is unused. They should have already
// been transformed into a cmpxchg loop in AtomicExpand.
if (N->hasAnyUseOfValue(0)) {
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
}
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
return N;
}
// Specialized lowering for the canonical form of an idempotent atomicrmw.
// The core idea here is that since the memory location isn't actually
// changing, all we need is a lowering for the *ordering* impacts of the
// atomicrmw. As such, we can choose a different operation and memory
// location to minimize impact on other code.
if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
// On X86, the only ordering which actually requires an instruction is
// seq_cst that isn't SingleThread; everything else just needs to be preserved
// during codegen and then dropped. Note that we expect (but don't assume)
// that orderings other than seq_cst and acq_rel have been canonicalized to
// a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
// accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), LockOp.getValue(1));
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
EVT VT = Node->getMemoryVT();
bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
// If this store is not sequentially consistent and the type is legal
// we can just keep it.
if (!IsSeqCst && IsTypeLegal)
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
SDValue Chain;
if (Subtarget.hasSSE1()) {
SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Node->getOperand(2));
MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SclToVec = DAG.getBitcast(StVT, SclToVec);
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
} else if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register using a stack temporary.
// This will put the whole integer into the significand.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Chain =
DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
MPI, /*Align*/ 0, MachineMemOperand::MOStore);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue LdOps[] = {Chain, StackPtr};
SDValue Value =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
/*Align*/ None, MachineMemOperand::MOLoad);
Chain = Value.getValue(1);
// Now use an FIST to do the atomic store.
SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
Chain =
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
}
if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
return Chain;
}
}
}
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
Node->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2),
Node->getMemOperand());
return Swap.getValue(1);
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDLoc DL(N);
// Set the carry flag.
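// Adding all-ones to the incoming carry (0 or 1) wraps exactly when the carry
// is 1, so the X86ISD::ADD below materializes it in EFLAGS.CF for the
// following ADC/SBB.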
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
Op.getOperand(1), Carry.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
: (Type *)FixedVectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
return CallResult.first;
// Returned in bits 0:31 and 32:63 of xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
bool FillWithZeroes = false) {
// Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType();
if (InVT == NVT)
return InOp;
if (InOp.isUndef())
return DAG.getUNDEF(NVT);
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
SDValue N1 = InOp.getOperand(1);
if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
N1.isUndef()) {
InOp = InOp.getOperand(0);
InVT = InOp.getSimpleValueType();
InNumElts = InVT.getVectorNumElements();
}
}
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
EVT EltVT = InOp.getOperand(0).getValueType();
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
return DAG.getBuildVector(NVT, dl, Ops);
}
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
if (VT == MVT::v2f32 || VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
N->getMemoryVT(), N->getMemOperand());
}
return SDValue();
}
MVT IndexVT = Index.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the source nor the index is 512 bits, we
// need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
N->getMemoryVT(), N->getMemOperand());
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
MVT MaskVT = Mask.getSimpleValueType();
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
// Handle AVX masked loads which don't support passthru other than 0.
if (MaskVT.getVectorElementType() != MVT::i1) {
// We also allow undef in the isel pattern.
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
SDValue NewLoad = DAG.getMaskedLoad(
WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
SDValue Extract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Compressing masked store is supported on AVX-512 target only!");
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Compressing masked store is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked store op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
N->getOffset(), Mask, N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru nor the index is 512 bits, we
// need to widen until one is.
MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!IndexVT.is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
SDValue NewGather = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
MVT DstVT = Op.getSimpleValueType();
AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
unsigned SrcAS = N->getSrcAddressSpace();
assert(SrcAS != N->getDestAddressSpace() &&
"addrspacecast must be between different address spaces");
if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i64) {
Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i32) {
Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
} else {
report_fatal_error("Bad address space in addrspacecast");
}
return Op;
}
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned Offset = IsStrict ? 1 : 0;
SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
SDLoc dl(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
CallOptions, dl, Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
// Custom split CVTPS2PH with wide types.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
EVT VT = Op.getValueType();
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
SDValue RC = Op.getOperand(1);
Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::STRICT_UINT_TO_FP:
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::FP16_TO_FP:
case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
case ISD::FP_TO_FP16:
case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::LRINT:
case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH:
return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
}
}
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
if (!Res.getNode())
return;
// If the original node has one result, take the return value from
// LowerOperation as is. It might not be result number 0.
if (N->getNumValues() == 1) {
Results.push_back(Res);
return;
}
// If the original node has multiple results, then the return node should
// have the same number of results.
assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Place the new result values based on their result numbers in N.
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ReplaceNodeResults: ";
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
case X86ISD::CVTPH2PS: {
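// The result type needs splitting here: convert each half of the operand
// separately and concatenate the two converted halves back together.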
EVT VT = N->getValueType(0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
case X86ISD::STRICT_CVTPH2PS: {
EVT VT = N->getValueType(0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
{N->getOperand(0), Lo});
Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
{N->getOperand(0), Hi});
SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// Bit count should fit in 32-bits, extract it as that and then zero
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
Results.push_back(Wide);
}
return;
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
// Pre-promote these to vXi16 to avoid op legalization thinking all 16
// elements are needed.
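// e.g. a v2i8 multiply is promoted to v2i16, multiplied there, truncated
// back to v2i8, and then concatenated with undefs up to the legal v16i8.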
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
unsigned NumConcats = 16 / VT.getVectorNumElements();
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
Results.push_back(Res);
return;
}
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
NumConcat * InVT.getVectorNumElements());
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
Results.push_back(Res);
return;
}
case ISD::ABS: {
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
SDValue Lo, Hi, Tmp;
SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(0, dl, HalfT));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
if (VT.isVector()) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
APInt SplatVal;
if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
unsigned NumConcats = 128 / VT.getSizeInBits();
SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
Ops0[0] = N->getOperand(0);
EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
Results.push_back(Res);
}
return;
}
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
case ISD::TRUNCATE: {
MVT VT = N->getSimpleValueType(0);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
// the best thing so do some custom legalization to avoid some cases.
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
unsigned InBits = InVT.getSizeInBits();
if (128 % InBits == 0) {
// 128-bit and smaller inputs should avoid the truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = VT.getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
}
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
}
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
// We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
// There's one case we can widen to 512 bits and use VTRUNC.
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
DAG.getUNDEF(MVT::v4i64));
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
}
if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
isTypeLegal(MVT::v4i64)) {
// The input needs splitting and the output needs widening. Let's use two
// VTRUNCs, and shuffle their results together into the wider type.
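// Each VTRUNC produces a v16i8 whose elements 0..3 hold the truncated
// bytes; the shuffle below gathers both sets of four into the low eight
// elements and leaves the rest of the widened result undefined.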
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
{ 0, 1, 2, 3, 16, 17, 18, 19,
-1, -1, -1, -1, -1, -1, -1, -1 });
Results.push_back(Res);
return;
}
return;
}
case ISD::ANY_EXTEND:
// Right now, only MVT::v8i8 has Custom action for an illegal type.
// It's intended to custom handle the input type.
assert(N->getValueType(0) == MVT::v8i8 &&
"Do not know how to legalize this Node");
return;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v4i16 || InVT == MVT::v4i8)){
assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!");
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 in-vector. This is better
// because sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra, followed by extending from i32 to i64 using pcmpgt. By custom
// splitting, the sra from the extend to i32 can be shared by both halves.
In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
if (VT == MVT::v16i32 || VT == MVT::v8i64) {
if (!InVT.is128BitVector()) {
// Not a 128 bit vector, but maybe type legalization will promote
// it to 128 bits.
if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
return;
InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
if (!InVT.is128BitVector())
return;
// Promote the input to 128 bits. Type legalization will turn this into
// zext_inreg/sext_inreg.
In = DAG.getNode(N->getOpcode(), dl, InVT, In);
}
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
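// e.g. for a v16i8 input this builds the mask <8..15, undef x 8> so the
// upper half lands in the low elements before the in-vector extend.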
unsigned NumElts = InVT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
for (unsigned i = 0; i != HalfNumElts; ++i)
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
}
return;
}
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: {
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
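// e.g. a v4i16 result is computed as a v4i32 fp_to_sint, asserted back to
// 16 bits, truncated to v4i16, and finally widened out to v8i16.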
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
// If we have VLX we can emit a target-specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
// Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
// For strict nodes we'll need to widen ourselves.
// FIXME: Fix the type legalizer to safely widen strict nodes?
if (!IsStrict)
return;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f64));
Opc = N->getOpcode();
}
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
}
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
// Custom widen strict v2f32->v2i32 by padding with zeros.
// FIXME: Should generic type legalizer do this?
if (Src.getValueType() == MVT::v2f32 && IsStrict) {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
return;
}
assert(!VT.isVector() && "Vectors should have been handled above!");
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
// If we use a 128-bit result we might need to use a target specific node.
unsigned SrcElts =
std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
unsigned Opc = N->getOpcode();
if (NumElts != SrcElts) {
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
}
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
SDValue Chain;
if (IsStrict) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
Chain = Res.getValue(1);
} else
Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
if (IsStrict)
Results.push_back(Chain);
}
return;
}
case ISD::LRINT:
case ISD::LLRINT: {
if (SDValue V = LRINT_LLRINTHelper(N, DAG))
Results.push_back(V);
return;
}
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP: {
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
: X86ISD::STRICT_CVTUI2P;
SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
} else {
unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
}
return;
}
if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
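// There is no unsigned 64-bit convert here, so emulate u64->f32 via the
// signed convert. Roughly: if the sign bit is set, halve the value with
// round-to-odd ((x >> 1) | (x & 1)), convert as signed and double the
// result with an fadd; otherwise convert the value directly. Rounding to
// odd in the halving step keeps the final result correctly rounded.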
SDValue Zero = DAG.getConstant(0, dl, SrcVT);
SDValue One = DAG.getConstant(1, dl, SrcVT);
SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
{N->getOperand(0), Elt});
else
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
}
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
if (IsStrict) {
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
SignCvts[0].getValue(1), SignCvts[1].getValue(1));
Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
}
IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
IsNeg =
DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
Results.push_back(Cvt);
if (IsStrict)
Results.push_back(Chain);
return;
}
if (SrcVT != MVT::v2i32)
return;
if (IsSigned || Subtarget.hasAVX512()) {
if (!IsStrict)
return;
// Custom widen strict v2i32->v2f32 to avoid scalarization.
// FIXME: Should generic type legalizer do this?
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getConstant(0, dl, MVT::v2i32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
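// Use the classic 2^52 bias trick: 0x4330000000000000 is the double 2^52.
// OR-ing the zero-extended 32-bit value into the low mantissa bits yields
// exactly 2^52 + x as an f64; subtracting 2^52 recovers x exactly, and the
// VFPROUND below narrows the result to f32.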
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict) {
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{N->getOperand(0), Or, VBias});
SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
{MVT::v4f32, MVT::Other},
{Sub.getValue(1), Sub});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
} else {
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
}
return;
}
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (!isTypeLegal(Src.getValueType()))
return;
SDValue V;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), N->getOperand(1)});
else
V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
if (IsStrict)
Results.push_back(V.getValue(1));
return;
}
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
Results);
return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
Results);
return;
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
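// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX for
// the 128-bit form) and the replacement value in ECX:EBX (RCX:RBX); the
// loaded value comes back in EDX:EAX (RDX:RAX) and ZF is set on success.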
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX, we shouldn't use
// cmpxchg directly. The lowering of that instruction clobbers RBX, and
// since RBX is then a reserved register, the register allocator will not
// ensure its value is properly saved and restored around this live range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert breaks, that is no longer the case, and we need to teach
// LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX. This is a matter
// of accepting i64 input for that pseudo and of restoring into a register
// of the right width in the expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
RBXSave,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
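// On a 32-bit target a plain i64 load would be type-legalized into two
// 32-bit loads, which is not a single atomic access. A 64-bit SSE load
// (or the x87 FILD below) keeps it as one 64-bit memory operation.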
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
if (Subtarget.hasSSE2()) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
// We use an alternative sequence for SSE1 that extracts as v2f32 and
// then casts to i64. This avoids a 128-bit stack temporary being
// created by type legalization if we were to cast v4f32->v2i64.
SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl));
Res = DAG.getBitcast(MVT::i64, Res);
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
// FIXME: We don't need a stack temporary if the result of the load
// is already being stored. We could just directly store there.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue StoreOps[] = { Chain, Result, StackPtr };
Chain = DAG.getMemIntrinsicNode(
X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
MPI, None /*Align*/, MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
// This load will be further type legalized.
Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
Results.push_back(Result);
Results.push_back(Result.getValue(1));
return;
}
}
// TODO: Use MOVLPS when SSE1 is available?
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
}
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
// If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
// we can split using the k-register rather than memory.
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
Lo = DAG.getBitcast(MVT::i32, Lo);
Hi = DAG.getBitcast(MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
Results.push_back(Res);
return;
}
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
// FIXME: Use v4f32 for SSE1?
assert(Subtarget.hasSSE2() && "Requires SSE2");
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
N->getOperand(0));
Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
return;
}
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
Gather->getPassThru(),
DAG.getUNDEF(VT));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
return;
}
case ISD::LOAD: {
// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
case ISD::ADDRSPACECAST: {
SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
}
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
NODE_NAME_CASE(BSF)
NODE_NAME_CASE(BSR)
NODE_NAME_CASE(FSHL)
NODE_NAME_CASE(FSHR)
NODE_NAME_CASE(FAND)
NODE_NAME_CASE(FANDN)
NODE_NAME_CASE(FOR)
NODE_NAME_CASE(FXOR)
NODE_NAME_CASE(FILD)
NODE_NAME_CASE(FIST)
NODE_NAME_CASE(FP_TO_INT_IN_MEM)
NODE_NAME_CASE(FLD)
NODE_NAME_CASE(FST)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(BT)
NODE_NAME_CASE(CMP)
NODE_NAME_CASE(FCMP)
NODE_NAME_CASE(STRICT_FCMP)
NODE_NAME_CASE(STRICT_FCMPS)
NODE_NAME_CASE(COMI)
NODE_NAME_CASE(UCOMI)
NODE_NAME_CASE(CMPM)
NODE_NAME_CASE(STRICT_CMPM)
NODE_NAME_CASE(CMPM_SAE)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
NODE_NAME_CASE(FSETCCM)
NODE_NAME_CASE(FSETCCM_SAE)
NODE_NAME_CASE(CMOV)
NODE_NAME_CASE(BRCOND)
NODE_NAME_CASE(RET_FLAG)
NODE_NAME_CASE(IRET)
NODE_NAME_CASE(REP_STOS)
NODE_NAME_CASE(REP_MOVS)
NODE_NAME_CASE(GlobalBaseReg)
NODE_NAME_CASE(Wrapper)
NODE_NAME_CASE(WrapperRIP)
NODE_NAME_CASE(MOVQ2DQ)
NODE_NAME_CASE(MOVDQ2Q)
NODE_NAME_CASE(MMX_MOVD2W)
NODE_NAME_CASE(MMX_MOVW2D)
NODE_NAME_CASE(PEXTRB)
NODE_NAME_CASE(PEXTRW)
NODE_NAME_CASE(INSERTPS)
NODE_NAME_CASE(PINSRB)
NODE_NAME_CASE(PINSRW)
NODE_NAME_CASE(PSHUFB)
NODE_NAME_CASE(ANDNP)
NODE_NAME_CASE(BLENDI)
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
NODE_NAME_CASE(FMAX)
NODE_NAME_CASE(FMAXS)
NODE_NAME_CASE(FMAX_SAE)
NODE_NAME_CASE(FMAXS_SAE)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(FMINS)
NODE_NAME_CASE(FMIN_SAE)
NODE_NAME_CASE(FMINS_SAE)
NODE_NAME_CASE(FMAXC)
NODE_NAME_CASE(FMINC)
NODE_NAME_CASE(FRSQRT)
NODE_NAME_CASE(FRCP)
NODE_NAME_CASE(EXTRQI)
NODE_NAME_CASE(INSERTQI)
NODE_NAME_CASE(TLSADDR)
NODE_NAME_CASE(TLSBASEADDR)
NODE_NAME_CASE(TLSCALL)
NODE_NAME_CASE(EH_SJLJ_SETJMP)
NODE_NAME_CASE(EH_SJLJ_LONGJMP)
NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
NODE_NAME_CASE(EH_RETURN)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(FNSTCW16m)
NODE_NAME_CASE(LCMPXCHG_DAG)
NODE_NAME_CASE(LCMPXCHG8_DAG)
NODE_NAME_CASE(LCMPXCHG16_DAG)
NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
NODE_NAME_CASE(LADD)
NODE_NAME_CASE(LSUB)
NODE_NAME_CASE(LOR)
NODE_NAME_CASE(LXOR)
NODE_NAME_CASE(LAND)
NODE_NAME_CASE(VZEXT_MOVL)
NODE_NAME_CASE(VZEXT_LOAD)
NODE_NAME_CASE(VEXTRACT_STORE)
NODE_NAME_CASE(VTRUNC)
NODE_NAME_CASE(VTRUNCS)
NODE_NAME_CASE(VTRUNCUS)
NODE_NAME_CASE(VMTRUNC)
NODE_NAME_CASE(VMTRUNCS)
NODE_NAME_CASE(VMTRUNCUS)
NODE_NAME_CASE(VTRUNCSTORES)
NODE_NAME_CASE(VTRUNCSTOREUS)
NODE_NAME_CASE(VMTRUNCSTORES)
NODE_NAME_CASE(VMTRUNCSTOREUS)
NODE_NAME_CASE(VFPEXT)
NODE_NAME_CASE(STRICT_VFPEXT)
NODE_NAME_CASE(VFPEXT_SAE)
NODE_NAME_CASE(VFPEXTS)
NODE_NAME_CASE(VFPEXTS_SAE)
NODE_NAME_CASE(VFPROUND)
NODE_NAME_CASE(STRICT_VFPROUND)
NODE_NAME_CASE(VMFPROUND)
NODE_NAME_CASE(VFPROUND_RND)
NODE_NAME_CASE(VFPROUNDS)
NODE_NAME_CASE(VFPROUNDS_RND)
NODE_NAME_CASE(VSHLDQ)
NODE_NAME_CASE(VSRLDQ)
NODE_NAME_CASE(VSHL)
NODE_NAME_CASE(VSRL)
NODE_NAME_CASE(VSRA)
NODE_NAME_CASE(VSHLI)
NODE_NAME_CASE(VSRLI)
NODE_NAME_CASE(VSRAI)
NODE_NAME_CASE(VSHLV)
NODE_NAME_CASE(VSRLV)
NODE_NAME_CASE(VSRAV)
NODE_NAME_CASE(VROTLI)
NODE_NAME_CASE(VROTRI)
NODE_NAME_CASE(VPPERM)
NODE_NAME_CASE(CMPP)
NODE_NAME_CASE(STRICT_CMPP)
NODE_NAME_CASE(PCMPEQ)
NODE_NAME_CASE(PCMPGT)
NODE_NAME_CASE(PHMINPOS)
NODE_NAME_CASE(ADD)
NODE_NAME_CASE(SUB)
NODE_NAME_CASE(ADC)
NODE_NAME_CASE(SBB)
NODE_NAME_CASE(SMUL)
NODE_NAME_CASE(UMUL)
NODE_NAME_CASE(OR)
NODE_NAME_CASE(XOR)
NODE_NAME_CASE(AND)
NODE_NAME_CASE(BEXTR)
NODE_NAME_CASE(BZHI)
NODE_NAME_CASE(PDEP)
NODE_NAME_CASE(PEXT)
NODE_NAME_CASE(MUL_IMM)
NODE_NAME_CASE(MOVMSK)
NODE_NAME_CASE(PTEST)
NODE_NAME_CASE(TESTP)
NODE_NAME_CASE(KORTEST)
NODE_NAME_CASE(KTEST)
NODE_NAME_CASE(KADD)
NODE_NAME_CASE(KSHIFTL)
NODE_NAME_CASE(KSHIFTR)
NODE_NAME_CASE(PACKSS)
NODE_NAME_CASE(PACKUS)
NODE_NAME_CASE(PALIGNR)
NODE_NAME_CASE(VALIGN)
NODE_NAME_CASE(VSHLD)
NODE_NAME_CASE(VSHRD)
NODE_NAME_CASE(VSHLDV)
NODE_NAME_CASE(VSHRDV)
NODE_NAME_CASE(PSHUFD)
NODE_NAME_CASE(PSHUFHW)
NODE_NAME_CASE(PSHUFLW)
NODE_NAME_CASE(SHUFP)
NODE_NAME_CASE(SHUF128)
NODE_NAME_CASE(MOVLHPS)
NODE_NAME_CASE(MOVHLPS)
NODE_NAME_CASE(MOVDDUP)
NODE_NAME_CASE(MOVSHDUP)
NODE_NAME_CASE(MOVSLDUP)
NODE_NAME_CASE(MOVSD)
NODE_NAME_CASE(MOVSS)
NODE_NAME_CASE(UNPCKL)
NODE_NAME_CASE(UNPCKH)
NODE_NAME_CASE(VBROADCAST)
NODE_NAME_CASE(VBROADCAST_LOAD)
NODE_NAME_CASE(VBROADCASTM)
NODE_NAME_CASE(SUBV_BROADCAST)
NODE_NAME_CASE(VPERMILPV)
NODE_NAME_CASE(VPERMILPI)
NODE_NAME_CASE(VPERM2X128)
NODE_NAME_CASE(VPERMV)
NODE_NAME_CASE(VPERMV3)
NODE_NAME_CASE(VPERMI)
NODE_NAME_CASE(VPTERNLOG)
NODE_NAME_CASE(VFIXUPIMM)
NODE_NAME_CASE(VFIXUPIMM_SAE)
NODE_NAME_CASE(VFIXUPIMMS)
NODE_NAME_CASE(VFIXUPIMMS_SAE)
NODE_NAME_CASE(VRANGE)
NODE_NAME_CASE(VRANGE_SAE)
NODE_NAME_CASE(VRANGES)
NODE_NAME_CASE(VRANGES_SAE)
NODE_NAME_CASE(PMULUDQ)
NODE_NAME_CASE(PMULDQ)
NODE_NAME_CASE(PSADBW)
NODE_NAME_CASE(DBPSADBW)
NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
NODE_NAME_CASE(VAARG_64)
NODE_NAME_CASE(WIN_ALLOCA)
NODE_NAME_CASE(MEMBARRIER)
NODE_NAME_CASE(MFENCE)
NODE_NAME_CASE(SEG_ALLOCA)
NODE_NAME_CASE(PROBED_ALLOCA)
NODE_NAME_CASE(RDRAND)
NODE_NAME_CASE(RDSEED)
NODE_NAME_CASE(RDPKRU)
NODE_NAME_CASE(WRPKRU)
NODE_NAME_CASE(VPMADDUBSW)
NODE_NAME_CASE(VPMADDWD)
NODE_NAME_CASE(VPSHA)
NODE_NAME_CASE(VPSHL)
NODE_NAME_CASE(VPCOM)
NODE_NAME_CASE(VPCOMU)
NODE_NAME_CASE(VPERMIL2)
NODE_NAME_CASE(FMSUB)
NODE_NAME_CASE(STRICT_FMSUB)
NODE_NAME_CASE(FNMADD)
NODE_NAME_CASE(STRICT_FNMADD)
NODE_NAME_CASE(FNMSUB)
NODE_NAME_CASE(STRICT_FNMSUB)
NODE_NAME_CASE(FMADDSUB)
NODE_NAME_CASE(FMSUBADD)
NODE_NAME_CASE(FMADD_RND)
NODE_NAME_CASE(FNMADD_RND)
NODE_NAME_CASE(FMSUB_RND)
NODE_NAME_CASE(FNMSUB_RND)
NODE_NAME_CASE(FMADDSUB_RND)
NODE_NAME_CASE(FMSUBADD_RND)
NODE_NAME_CASE(VPMADD52H)
NODE_NAME_CASE(VPMADD52L)
NODE_NAME_CASE(VRNDSCALE)
NODE_NAME_CASE(STRICT_VRNDSCALE)
NODE_NAME_CASE(VRNDSCALE_SAE)
NODE_NAME_CASE(VRNDSCALES)
NODE_NAME_CASE(VRNDSCALES_SAE)
NODE_NAME_CASE(VREDUCE)
NODE_NAME_CASE(VREDUCE_SAE)
NODE_NAME_CASE(VREDUCES)
NODE_NAME_CASE(VREDUCES_SAE)
NODE_NAME_CASE(VGETMANT)
NODE_NAME_CASE(VGETMANT_SAE)
NODE_NAME_CASE(VGETMANTS)
NODE_NAME_CASE(VGETMANTS_SAE)
NODE_NAME_CASE(PCMPESTR)
NODE_NAME_CASE(PCMPISTR)
NODE_NAME_CASE(XTEST)
NODE_NAME_CASE(COMPRESS)
NODE_NAME_CASE(EXPAND)
NODE_NAME_CASE(SELECTS)
NODE_NAME_CASE(ADDSUB)
NODE_NAME_CASE(RCP14)
NODE_NAME_CASE(RCP14S)
NODE_NAME_CASE(RCP28)
NODE_NAME_CASE(RCP28_SAE)
NODE_NAME_CASE(RCP28S)
NODE_NAME_CASE(RCP28S_SAE)
NODE_NAME_CASE(EXP2)
NODE_NAME_CASE(EXP2_SAE)
NODE_NAME_CASE(RSQRT14)
NODE_NAME_CASE(RSQRT14S)
NODE_NAME_CASE(RSQRT28)
NODE_NAME_CASE(RSQRT28_SAE)
NODE_NAME_CASE(RSQRT28S)
NODE_NAME_CASE(RSQRT28S_SAE)
NODE_NAME_CASE(FADD_RND)
NODE_NAME_CASE(FADDS)
NODE_NAME_CASE(FADDS_RND)
NODE_NAME_CASE(FSUB_RND)
NODE_NAME_CASE(FSUBS)
NODE_NAME_CASE(FSUBS_RND)
NODE_NAME_CASE(FMUL_RND)
NODE_NAME_CASE(FMULS)
NODE_NAME_CASE(FMULS_RND)
NODE_NAME_CASE(FDIV_RND)
NODE_NAME_CASE(FDIVS)
NODE_NAME_CASE(FDIVS_RND)
NODE_NAME_CASE(FSQRT_RND)
NODE_NAME_CASE(FSQRTS)
NODE_NAME_CASE(FSQRTS_RND)
NODE_NAME_CASE(FGETEXP)
NODE_NAME_CASE(FGETEXP_SAE)
NODE_NAME_CASE(FGETEXPS)
NODE_NAME_CASE(FGETEXPS_SAE)
NODE_NAME_CASE(SCALEF)
NODE_NAME_CASE(SCALEF_RND)
NODE_NAME_CASE(SCALEFS)
NODE_NAME_CASE(SCALEFS_RND)
NODE_NAME_CASE(AVG)
NODE_NAME_CASE(MULHRS)
NODE_NAME_CASE(SINT_TO_FP_RND)
NODE_NAME_CASE(UINT_TO_FP_RND)
NODE_NAME_CASE(CVTTP2SI)
NODE_NAME_CASE(CVTTP2UI)
NODE_NAME_CASE(STRICT_CVTTP2SI)
NODE_NAME_CASE(STRICT_CVTTP2UI)
NODE_NAME_CASE(MCVTTP2SI)
NODE_NAME_CASE(MCVTTP2UI)
NODE_NAME_CASE(CVTTP2SI_SAE)
NODE_NAME_CASE(CVTTP2UI_SAE)
NODE_NAME_CASE(CVTTS2SI)
NODE_NAME_CASE(CVTTS2UI)
NODE_NAME_CASE(CVTTS2SI_SAE)
NODE_NAME_CASE(CVTTS2UI_SAE)
NODE_NAME_CASE(CVTSI2P)
NODE_NAME_CASE(CVTUI2P)
NODE_NAME_CASE(STRICT_CVTSI2P)
NODE_NAME_CASE(STRICT_CVTUI2P)
NODE_NAME_CASE(MCVTSI2P)
NODE_NAME_CASE(MCVTUI2P)
NODE_NAME_CASE(VFPCLASS)
NODE_NAME_CASE(VFPCLASSS)
NODE_NAME_CASE(MULTISHIFT)
NODE_NAME_CASE(SCALAR_SINT_TO_FP)
NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
NODE_NAME_CASE(SCALAR_UINT_TO_FP)
NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
NODE_NAME_CASE(CVTPS2PH)
NODE_NAME_CASE(STRICT_CVTPS2PH)
NODE_NAME_CASE(MCVTPS2PH)
NODE_NAME_CASE(CVTPH2PS)
NODE_NAME_CASE(STRICT_CVTPH2PS)
NODE_NAME_CASE(CVTPH2PS_SAE)
NODE_NAME_CASE(CVTP2SI)
NODE_NAME_CASE(CVTP2UI)
NODE_NAME_CASE(MCVTP2SI)
NODE_NAME_CASE(MCVTP2UI)
NODE_NAME_CASE(CVTP2SI_RND)
NODE_NAME_CASE(CVTP2UI_RND)
NODE_NAME_CASE(CVTS2SI)
NODE_NAME_CASE(CVTS2UI)
NODE_NAME_CASE(CVTS2SI_RND)
NODE_NAME_CASE(CVTS2UI_RND)
NODE_NAME_CASE(CVTNE2PS2BF16)
NODE_NAME_CASE(CVTNEPS2BF16)
NODE_NAME_CASE(MCVTNEPS2BF16)
NODE_NAME_CASE(DPBF16PS)
NODE_NAME_CASE(LWPINS)
NODE_NAME_CASE(MGATHER)
NODE_NAME_CASE(MSCATTER)
NODE_NAME_CASE(VPDPBUSD)
NODE_NAME_CASE(VPDPBUSDS)
NODE_NAME_CASE(VPDPWSSD)
NODE_NAME_CASE(VPDPWSSDS)
NODE_NAME_CASE(VPSHUFBITQMB)
NODE_NAME_CASE(GF2P8MULB)
NODE_NAME_CASE(GF2P8AFFINEQB)
NODE_NAME_CASE(GF2P8AFFINEINVQB)
NODE_NAME_CASE(NT_CALL)
NODE_NAME_CASE(NT_BRIND)
NODE_NAME_CASE(UMWAIT)
NODE_NAME_CASE(TPAUSE)
NODE_NAME_CASE(ENQCMD)
NODE_NAME_CASE(ENQCMDS)
NODE_NAME_CASE(VP2INTERSECT)
}
return nullptr;
#undef NODE_NAME_CASE
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
return false;
// If BaseGV requires a register for the PIC base, we cannot also have a
// BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
if ((M != CodeModel::Small || isPositionIndependent()) &&
Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
switch (AM.Scale) {
case 0:
case 1:
case 2:
case 4:
case 8:
// These scales always work.
break;
case 3:
case 5:
case 9:
// These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
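// e.g. a scale of 3 is encoded as reg + reg*2, so the base slot of the
// address is already occupied by the index register itself.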
if (AM.HasBaseReg)
return false;
break;
default: // Other stuff never works.
return false;
}
return true;
}
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
unsigned Bits = Ty->getScalarSizeInBits();
// 8-bit shifts are always expensive, but versions with a scalar amount aren't
// particularly cheaper than those without.
if (Bits == 8)
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
// Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
if (Subtarget.hasXOP() &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
// AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
// shifts just as cheap as scalar ones.
if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
// AVX512BW has shifts such as vpsllvw.
if (Subtarget.hasBWI() && Bits == 16)
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
}
bool X86TargetLowering::isBinOp(unsigned Opcode) const {
switch (Opcode) {
// These are non-commutative binops.
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::ANDNP:
case X86ISD::PCMPGT:
case X86ISD::FMAX:
case X86ISD::FMIN:
case X86ISD::FANDN:
return true;
}
return TargetLoweringBase::isBinOp(Opcode);
}
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
switch (Opcode) {
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::PCMPEQ:
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ:
case X86ISD::FMAXC:
case X86ISD::FMINC:
case X86ISD::FAND:
case X86ISD::FOR:
case X86ISD::FXOR:
return true;
}
return TargetLoweringBase::isCommutativeBinOp(Opcode);
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
if (!isTypeLegal(EVT::getEVT(Ty1)))
return false;
assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
// Assuming the caller doesn't have a zeroext or signext return parameter,
// truncation all the way down to i1 is valid.
return true;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
// Can also use sub to handle negated immediates.
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2))
return true;
if (Val.getOpcode() != ISD::LOAD)
return false;
if (!VT1.isSimple() || !VT1.isInteger() ||
!VT2.isSimple() || !VT2.isInteger())
return false;
switch (VT1.getSimpleVT().SimpleTy) {
default: break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
// X86 has 8, 16, and 32-bit zero-extending loads.
return true;
}
return false;
}
bool X86TargetLowering::shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
// A uniform shift amount in a vector shift or funnel shift may be much
// cheaper than a generic variable vector shift, so make that pattern visible
// to SDAG by sinking the shuffle instruction next to the shift.
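// e.g. a splat shufflevector feeding the shift-amount operand of a vector
// shl/lshr/ashr (or fshl/fshr) is sunk so instruction selection can pick a
// shift-by-scalar form.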
int ShiftAmountOpNum = -1;
if (I->isShift())
ShiftAmountOpNum = 1;
else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
if (II->getIntrinsicID() == Intrinsic::fshl ||
II->getIntrinsicID() == Intrinsic::fshr)
ShiftAmountOpNum = 2;
}
if (ShiftAmountOpNum == -1)
return false;
auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
isVectorShiftByScalarCheap(I->getType())) {
Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
return true;
}
return false;
}
bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
if (!Subtarget.is64Bit())
return false;
return TargetLowering::shouldConvertPhiType(From, To);
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
if (SrcVT.getScalarType() == MVT::i1)
return false;
return true;
}
bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
if (!VT.isSimple())
return false;
// Not for i1 vectors
if (VT.getSimpleVT().getScalarType() == MVT::i1)
return false;
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSimpleVT().getSizeInBits() == 64)
return false;
// We only care that the types being shuffled are legal. The lowering can
// handle any possible shuffle mask that results.
return isTypeLegal(VT.getSimpleVT());
}
bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
EVT VT) const {
// Don't convert an 'and' into a shuffle that we don't directly support.
// vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
if (!Subtarget.hasAVX2())
if (VT == MVT::v32i8 || VT == MVT::v16i16)
return false;
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
// If the subtarget is using thunks, we need to not generate jump tables.
if (Subtarget.useIndirectThunkBranches())
return false;
// Otherwise, fallback on the generic logic.
return TargetLowering::areJTsAllowed(Fn);
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
+// Returns true if EFLAGS is consumed after this iterator in the rest of the
+// basic block or any successors of the basic block.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+ MachineBasicBlock *BB) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+ miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(X86::EFLAGS))
+ return true;
+ // If we found a def, we can stop searching.
+ if (mi.definesRegister(X86::EFLAGS))
+ return false;
+ }
+
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return true;
+ }
+
+ return false;
+}
+
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
// xbegin sinkMBB
//
// mainMBB:
// s0 = -1
//
// fallBB:
// eax = # XABORT_DEF
// s1 = eax
//
// sinkMBB:
// v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
+ if (isEFLAGSLiveAfter(MI, MBB)) {
+ mainMBB->addLiveIn(X86::EFLAGS);
+ fallMBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
Register mainDstReg = MRI.createVirtualRegister(RC);
Register fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abort to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// EAX := XABORT_DEF
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
Align Alignment = Align(MI.getOperand(8).getImm());
MachineFunction *MF = MBB->getParent();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
// Clone the MMO into two separate MMOs for loading and storing
MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
bool NeedsAlign = (Alignment > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't create a branch.
// We don't need to alter control flow.
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
//
//        thisMBB
//          |     .
//          |        .
//      offsetMBB   overflowMBB
//          |        .
//          |     .
//         endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
thisMBB->addSuccessor(offsetMBB);
thisMBB->addSuccessor(overflowMBB);
// endMBB is a successor of both offsetMBB and overflowMBB
offsetMBB->addSuccessor(endMBB);
overflowMBB->addSuccessor(endMBB);
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addReg(OffsetReg)
.addImm(MaxOffset + 8 - ArgSizeA8);
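// Since the offsets and ArgSizeA8 are multiples of 8, OffsetReg is
// >= MaxOffset + 8 - ArgSizeA8 exactly when OffsetReg + ArgSizeA8 would exceed
// MaxOffset, i.e. when the argument no longer fits in the register save area.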
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
.addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
.addReg(OffsetReg64)
.addReg(RegSaveReg);
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
.setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
//
// Emit code to use overflow area
//
// Load the overflow_area address into a register.
Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addReg(OverflowAddrReg)
.addImm(Alignment.value() - 1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
.setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
.addReg(OffsetDestReg).addMBB(offsetMBB)
.addReg(OverflowDestReg).addMBB(overflowMBB);
}
// Erase the pseudo instruction
MI.eraseFromParent();
return endMBB;
}
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them;
// however, this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, and it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
// Create the new basic blocks. One block contains all the XMM stores,
// and one block is the final destination regardless of whether any
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
F->insert(MBBIter, EndMBB);
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MBB->addSuccessor(XMMSaveMBB);
// The XMMSaveMBB will fall through to the end block.
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
Register CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
assert((MI.getNumOperands() <= 3 ||
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, Align(16));
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
.addImm(/*Disp=*/Offset)
.addReg(/*Segment=*/0)
.addReg(MI.getOperand(i).getReg())
.addMemOperand(MMO);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
- // Scan forward through BB for a use/def of EFLAGS.
- MachineBasicBlock::iterator miI(std::next(SelectItr));
- for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
- const MachineInstr& mi = *miI;
- if (mi.readsRegister(X86::EFLAGS))
- return false;
- if (mi.definesRegister(X86::EFLAGS))
- break; // Should have kill-flag - update below.
- }
-
- // If we hit the end of the block, check whether EFLAGS is live into a
- // successor.
- if (miI == BB->end()) {
- for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
- sEnd = BB->succ_end();
- sItr != sEnd; ++sItr) {
- MachineBasicBlock* succ = *sItr;
- if (succ->isLiveIn(X86::EFLAGS))
- return false;
- }
- }
+ if (isEFLAGSLiveAfter(SelectItr, BB))
+ return false;
// We found a def, or hit the end of the basic block and EFLAGS wasn't live
// out. SelectMI should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return true;
default:
return false;
}
}
// Helper function that inserts PHI functions into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for the
// last PHI function inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MIItBegin->getDebugLoc();
X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that went into that PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
Register DestReg = MIIt->getOperand(0).getReg();
Register Op1Reg = MIIt->getOperand(1).getReg();
Register Op2Reg = MIIt->getOperand(2).getReg();
// If the CMOV we are lowering uses the opposite condition from the jump we
// generated, then we have to swap the operands for the PHI that is going to
// be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(FalseMBB)
.addReg(Op2Reg)
.addMBB(TrueMBB);
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
}
return MIB;
}
// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
//
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
//
// to two successive branches.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
//
// (sitofp (zext (fcmp une)))
//
// we would generate:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
//
// because this custom-inserter would have generated:
//
// A
// | \
// | B
// | /
// C
// | \
// | D
// | /
// E
//
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
// A
// | \
// | C
// | /|
// |/ |
// | |
// | D
// | /
// E
//
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
//
// Which, in our sitofp/fcmp example, gives us something like:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
//
// We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FirstInsertedMBB);
F->insert(It, SecondInsertedMBB);
F->insert(It, SinkMBB);
// For a cascaded CMOV, we lower it to two successive branches to
// the same block (SinkMBB). EFLAGS is used by both, so mark it as live into
// FirstInsertedMBB.
FirstInsertedMBB->addLiveIn(X86::EFLAGS);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
SecondInsertedMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
std::next(MachineBasicBlock::iterator(FirstCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FirstInsertedMBB);
// The true block target of the first branch is always SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
// The true block for the branch of FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SinkMBB);
// This is fallthrough.
SecondInsertedMBB->addSuccessor(SinkMBB);
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
Register DestReg = FirstCMOV.getOperand(0).getReg();
Register Op1Reg = FirstCMOV.getOperand(1).getReg();
Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(SecondInsertedMBB)
.addReg(Op2Reg)
.addMBB(ThisMBB);
// SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
// (the True operand of the SELECT_CC/CMOV nodes).
MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
TII->get(TargetOpcode::COPY),
SecondCascadedCMOV.getOperand(0).getReg())
.addReg(FirstCMOV.getOperand(0).getReg());
// Now remove the CMOVs.
FirstCMOV.eraseFromParent();
SecondCascadedCMOV.eraseFromParent();
return SinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between and a branch opcode to use.
// ThisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> FalseMBB
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all of which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1 first, where there are multiple CMOVs with the same
// condition. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition. Skip over
// intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
}
}
// Check for case 2, but only if we didn't already find case 1,
// as indicated by LastCMOV == &MI.
if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
}
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FalseMBB);
F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!LastCMOV->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
FalseMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
auto DbgIt = MachineBasicBlock::iterator(MI);
while (DbgIt != DbgEnd) {
auto Next = std::next(DbgIt);
if (DbgIt->isDebugInstr())
SinkMBB->push_back(DbgIt->removeFromParent());
DbgIt = Next;
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FalseMBB);
// The true block target of the first (or only) branch is always a SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FalseMBB.
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
ThisMBB->erase(MIItBegin, MIItEnd);
return SinkMBB;
}
static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::SUB64ri8;
return X86::SUB64ri32;
} else {
if (isInt<8>(Imm))
return X86::SUB32ri8;
return X86::SUB32ri;
}
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
const unsigned ProbeSize = getStackProbeSize(*MF);
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MF->insert(MBBIter, testMBB);
MF->insert(MBBIter, blockMBB);
MF->insert(MBBIter, tailMBB);
Register sizeVReg = MI.getOperand(1).getReg();
Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
Register TmpStackPtr = MRI.createVirtualRegister(
TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
Register FinalStackPtr = MRI.createVirtualRegister(
TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
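// Compute the final stack pointer: the current stack pointer minus the
// dynamically requested allocation size.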
BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
.addReg(physSPReg);
{
const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
.addReg(TmpStackPtr)
.addReg(sizeVReg);
}
// test rsp size
BuildMI(testMBB, DL,
TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
.addReg(FinalStackPtr)
.addReg(physSPReg);
BuildMI(testMBB, DL, TII->get(X86::JCC_1))
.addMBB(tailMBB)
.addImm(X86::COND_L);
testMBB->addSuccessor(blockMBB);
testMBB->addSuccessor(tailMBB);
// Touch the block, then extend it. This is the opposite order from the static
// probe, where we allocate and then touch; doing it this way avoids having to
// probe the tail of the static alloca. Possible scenarios are:
//
// + ---- <- ------------ <- ------------- <- ------------ +
// | |
// [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
// | |
// + <- ----------- <- ------------ <- ----------- <- ------------ +
//
// The property we want to enforce is to never have more than [page alloc] between two probes.
const unsigned MovMIOpc =
TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
.addImm(0);
BuildMI(blockMBB, DL,
TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
.addReg(physSPReg)
.addImm(ProbeSize);
BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
blockMBB->addSuccessor(testMBB);
// Replace the original instruction's result with the expected stack pointer.
BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(FinalStackPtr);
tailMBB->splice(tailMBB->end(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
MBB->addSuccessor(testMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return tailMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
//
// mallocMBB:
// Allocate by call to runtime
//
// continueMBB:
// ...
// [rest of original BB]
//
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Call into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
BB->addSuccessor(mallocMBB);
mallocMBB->addSuccessor(continueMBB);
bumpMBB->addSuccessor(continueMBB);
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
MI.getOperand(0).getReg())
.addReg(mallocPtrVReg)
.addMBB(mallocMBB)
.addReg(bumpSPPtrVReg)
.addMBB(bumpMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return continueMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
// the new block to the return destination with a normal JMP_4.
MachineBasicBlock *RestoreMBB =
MF->CreateMachineBasicBlock(BB->getBasicBlock());
assert(BB->succ_size() == 1);
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
// Marking this as an EH pad but not a funclet entry block causes PEI to
// restore stack pointers in the block.
RestoreMBB->setIsEHPad(true);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into calls
// inside MC; therefore, without the two markers, shrink-wrapping
// may push the prologue/epilogue past them.
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
// We don't call erase from parent because we want to keep the
// original instruction around.
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
MachineInstrBuilder CallseqEnd =
BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
Subtarget.is64Bit() ?
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else if (!isPositionIndependent()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(TII->getGlobalBaseReg(F))
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
case X86::INDIRECT_THUNK_CALL32:
return X86::CALLpcrel32;
case X86::INDIRECT_THUNK_CALL64:
return X86::CALL64pcrel32;
case X86::INDIRECT_THUNK_TCRETURN32:
return X86::TCRETURNdi;
case X86::INDIRECT_THUNK_TCRETURN64:
return X86::TCRETURNdi64;
}
llvm_unreachable("not indirect thunk opcode");
}
static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
// of the thunks for kernels where they have no easy ability to create
// aliases and are doing non-trivial configuration of the thunk's body. For
// example, the Linux kernel will do boot-time hot patching of the thunk
// bodies and cannot easily export aliases of these to loaded modules.
//
// Note that at any point in the future, we may need to change the semantics
// of how we implement retpolines and at that time will likely change the
// name of the called thunk. Essentially, there is no hard guarantee that
// LLVM will generate calls to specific thunks; we merely make a best-effort
// attempt to help out kernels and other systems where duplicating the
// thunks is costly.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
llvm_unreachable("unexpected reg for external indirect thunk");
}
if (Subtarget.useRetpolineIndirectCalls() ||
Subtarget.useRetpolineIndirectBranches()) {
// When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_lvi_thunk_r11";
}
llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register holding the callee into a scratch physical
// register (R11 on 64-bit) and call the matching indirect thunk, which
// branches through that register.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
// are available, use EDI instead. EDI is chosen because EBX is the PIC base
// register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
for (unsigned &Reg : AvailableRegs)
if (Reg == MO.getReg())
Reg = 0;
}
// Choose the first remaining non-zero available register.
unsigned AvailableReg = 0;
for (unsigned MaybeReg : AvailableRegs) {
if (MaybeReg) {
AvailableReg = MaybeReg;
break;
}
}
if (!AvailableReg)
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
/// SetJmp implies a future control-flow change upon calling the corresponding
/// LongJmp.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// If the shadow stack is enabled, we need to fix it as well, because some
/// return addresses will be skipped.
/// The function will save the SSP for future fixing in the function
/// emitLongJmpShadowStackFix.
/// \sa emitLongJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB;
// Memory Reference.
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
// Initialize a register with zero.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Write the SSP register value to offset 3 in input memory buffer.
unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
const int64_t SSPOffset = 3 * PVT.getStoreSize();
const unsigned MemOpndSlot = 1;
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
MIB.addReg(SSPCopyReg);
MIB.setMemRefs(MMOs);
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
// restoreMBB:
// if base pointer being used, load it from frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
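// With the small code model and no PIC, the address of restoreMBB can be
// stored as an immediate; otherwise it has to be materialized with a LEA.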
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(restoreMBB)
.addReg(0);
} else {
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOs);
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
emitSetJmpShadowStackFix(MI, thisMBB);
}
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// mainMBB:
// EAX = 0
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
Register FramePtr = RegInfo->getFrameRegister(*MF);
Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
/// Fix the shadow stack using the previously saved SSP value.
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
// xor vreg1, vreg1
// rdssp vreg1
// test vreg1, vreg1
// je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
// mov buf+24/12(%rip), vreg2
// sub vreg1, vreg2
// jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
// incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
// fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
// fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
// jne fixShadowLoopMBB # Iterate until you finish fixing
// # the Shadow Stack
// sinkMBB:
MachineFunction::iterator I = ++MBB->getIterator();
const BasicBlock *BB = MBB->getBasicBlock();
MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, checkSspMBB);
MF->insert(I, fallMBB);
MF->insert(I, fixShadowMBB);
MF->insert(I, fixShadowLoopPrepareMBB);
MF->insert(I, fixShadowLoopMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
if (PVT == MVT::i64) {
Register TmpZReg = MRI.createVirtualRegister(PtrRC);
BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
.addImm(0)
.addReg(ZReg)
.addImm(X86::sub_32bit);
ZReg = TmpZReg;
}
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Check whether the result of the SSP register is zero and jump directly
// to the sink.
unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
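// The SSP value saved by emitSetJmpShadowStackFix lives in slot 3 of the
// jump buffer.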
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, SPPOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
Register SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
// Increment the SSP using only the lower 8 bits of the (shifted) delta.
unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
// Jump if the result of the shift is zero.
BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
// Do a single shift left.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
Register Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
Register DecReg = MRI.createVirtualRegister(PtrRC);
Register CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
.addReg(DecReg)
.addMBB(fixShadowLoopMBB);
// Every iteration we increase the SSP by 128.
BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
// Every iteration we decrement the counter by 1.
unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
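// Jump buffer layout, in pointer-sized slots:
//   0 = frame pointer, 1 = resume IP (label), 2 = stack pointer,
//   3 = shadow stack pointer (only used when CET is enabled).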
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
MachineBasicBlock *thisMBB = MBB;
// When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
}
// Reload FP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, LabelOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload SP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
// the last instruction of the expansion.
}
MIB.setMemRefs(MMOs);
// Jump
BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
return thisMBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
unsigned Op = 0;
unsigned VR = 0;
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
if (UseImmLabel) {
Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
} else {
const TargetRegisterClass *TRC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addMBB(DispatchBB)
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
.addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
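// Store the dispatch block address into the SjLj function context at a fixed
// offset (56 bytes on 64-bit targets, 36 on 32-bit).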
if (UseImmLabel)
MIB.addMBB(DispatchBB);
else
MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
continue;
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
continue;
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad(true);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
// Insert MBBs.
MF->push_back(DispatchBB);
MF->push_back(DispContBB);
MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
unsigned JTE = getJumpTableEncoding();
MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
Register FP = RI.getFrameRegister(*MF);
Register BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
.addRegMask(RI.getNoPreservedMask());
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
.addRegMask(RI.getNoPreservedMask());
}
// IReg is used as an index in a memory operand and therefore can't be SP
Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
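// At this point DispatchBB has loaded the call-site index from the function
// context (offset 8 or 4 above) and branched to TrapBB on out-of-range
// values; DispContBB (below) turns that index into an indirect branch
// through the jump table created above.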
if (Subtarget.is64Bit()) {
Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addJumpTableIndex(MJTI)
.addReg(0);
// movzx IReg64, IReg
BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
.addImm(0)
.addReg(IReg)
.addImm(X86::sub_32bit);
switch (JTE) {
case MachineJumpTableInfo::EK_BlockAddress:
// jmpq *(BReg,IReg64,8)
BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
.addReg(BReg)
.addImm(8)
.addReg(IReg64)
.addImm(0)
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
.addReg(BReg)
.addImm(4)
.addReg(IReg64)
.addImm(0)
.addReg(0);
// movsx OReg64, OReg
BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
// addq BReg, OReg64, TReg
BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
.addReg(OReg64)
.addReg(BReg);
// jmpq *TReg
BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
break;
}
default:
llvm_unreachable("Unexpected jump table encoding");
}
} else {
// jmpl *.LJTI0_0(,IReg,4)
BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
.addReg(0)
.addImm(4)
.addReg(IReg)
.addJumpTableIndex(MJTI)
.addReg(0);
}
// Add the jump table entries as successors to the MBB.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
DispContBB->addSuccessor(LP);
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
MBB->succ_rend());
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
MBB->removeSuccessor(MBBS);
MBBLPads.push_back(MBBS);
}
}
MBB->addSuccessor(DispatchBB);
// Find the invoke call and mark all of the callee-saved registers as
// 'implicitly defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
continue;
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
unsigned Reg = SavedRegs[RegIdx];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
LP->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
auto TMMImmToTMMReg = [](unsigned Imm) {
assert (Imm < 8 && "Illegal tmm index");
return X86::TMM0 + Imm;
};
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
case X86::INDIRECT_THUNK_CALL32:
case X86::INDIRECT_THUNK_CALL64:
case X86::INDIRECT_THUNK_TCRETURN32:
case X86::INDIRECT_THUNK_TCRETURN64:
return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::PROBED_ALLOCA_32:
case X86::PROBED_ALLOCA_64:
return EmitLoweredProbedAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
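// The pseudo expands to a PUSHF/POP pair: push the flags onto the stack and
// pop them straight into the destination register.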
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
"Unexpected register in operand!");
Push->getOperand(2).setIsUndef();
assert(Push->getOperand(3).getReg() == X86::DF &&
"Unexpected register in operand!");
Push->getOperand(3).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
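// Mirror of RDFLAGS above: push the source register and POPF it back into
// the flags.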
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
int OrigCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
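// For example, a control word of 0x027F (round-to-nearest) becomes 0x0E7F,
// selecting round-toward-zero for the integer store below.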
// Extract to 16 bits.
Register NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
.addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return emitXRayCustomEvent(MI, BB);
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
// requires a memory operand. If the current architecture is i686 and the
// current function needs a base pointer - which is ESI on i686 - the
// register allocator would not be able to allocate registers for an address
// of the form X(%reg, %reg, Y): there would never be enough unreserved
// registers during regalloc (without the base pointer the only option would
// be X(%edi, %esi, Y)). We give the register allocator a hand by
// precomputing the address in a new vreg using LEA.
// If this is not i686 or there is no base pointer, there is nothing to do.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily need the base pointer to
// be ESI, we check for that. The reason: if this assert fails, the
// compiler's base pointer handling has changed in a way that most
// probably has to be addressed here as well.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use an index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
RMBBI->definesRegister(X86::EBX) ||
RMBBI->definesRegister(X86::ECX) ||
RMBBI->definesRegister(X86::EDX))) {
++RMBBI;
}
MachineBasicBlock::iterator MBBI(RMBBI);
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
return BB;
}
case X86::LCMPXCHG16B:
return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
case TargetOpcode::PREALLOCATED_SETUP: {
assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
auto MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setHasPreallocatedCall(true);
int64_t PreallocatedId = MI.getOperand(0).getImm();
size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
assert(StackAdjustment != 0 && "0 stack adjustment");
LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
<< StackAdjustment << "\n");
BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
.addReg(X86::ESP)
.addImm(StackAdjustment);
MI.eraseFromParent();
return BB;
}
case TargetOpcode::PREALLOCATED_ARG: {
assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
int64_t PreallocatedId = MI.getOperand(1).getImm();
int64_t ArgIdx = MI.getOperand(2).getImm();
auto MFI = MF->getInfo<X86MachineFunctionInfo>();
size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
<< ", arg offset " << ArgOffset << "\n");
// stack pointer + offset
addRegOffset(
BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
X86::ESP, false, ArgOffset);
MI.eraseFromParent();
return BB;
}
case X86::PTDPBSSD:
case X86::PTDPBSUD:
case X86::PTDPBUSD:
case X86::PTDPBUUD:
case X86::PTDPBF16PS: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::PTILEZERO: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Imm = MI.getOperand(0).getImm();
BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::PTILELOADD:
case X86::PTILELOADDT1:
case X86::PTILESTORED: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTILELOADD: Opc = X86::TILELOADD; break;
case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
case X86::PTILESTORED: Opc = X86::TILESTORED; break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
unsigned CurOp = 0;
if (Opc != X86::TILESTORED)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Define);
MIB.add(MI.getOperand(CurOp++)); // base
MIB.add(MI.getOperand(CurOp++)); // scale
MIB.add(MI.getOperand(CurOp++)); // index -- stride
MIB.add(MI.getOperand(CurOp++)); // displacement
MIB.add(MI.getOperand(CurOp++)); // segment
if (Opc == X86::TILESTORED)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Undef);
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
}
}
//===----------------------------------------------------------------------===//
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
unsigned EltSize = VT.getScalarSizeInBits();
if (VT.isVector()) {
// If the constant is all sign bits within the active bits, then extend it
// to the entire constant so that it can act as a boolean constant vector.
auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
return false;
for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
if (!DemandedElts[i] || V.getOperand(i).isUndef())
continue;
const APInt &Val = V.getConstantOperandAPInt(i);
if (Val.getBitWidth() > Val.getNumSignBits() &&
Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
return true;
}
return false;
};
// For vectors - if we have a constant, then try to sign extend.
// TODO: Handle AND/ANDN cases.
unsigned ActiveBits = DemandedBits.getActiveBits();
if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
(Opcode == ISD::OR || Opcode == ISD::XOR) &&
NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
VT.getVectorNumElements());
SDValue NewC =
TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
SDValue NewOp =
TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
}
return false;
}
// Only optimize Ands to prevent shrinking a constant that could be
// matched by movzx.
if (Opcode != ISD::AND)
return false;
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
APInt ShrunkMask = Mask & DemandedBits;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
// If the mask is all 0s there's nothing to do here.
if (Width == 0)
return false;
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Clamp the width to the element size to handle illegal types.
Width = std::min(Width, EltSize);
// Calculate a possible zero extend mask for this constant.
APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
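// Worked example: Mask = 0x1FF with only the low 8 bits demanded gives
// ShrunkMask = 0xFF and Width = 8, so ZeroExtendMask = 0xFF, exactly the
// kind of mask MOVZX can match.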
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
if (ZeroExtendMask == Mask)
return true;
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
return false;
// Replace the constant with the zero extend mask.
SDLoc DL(Op);
SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
}
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
Known.Zero.setBitsFrom(NumLoBits);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
Known = Known.anyextOrTrunc(BitWidth);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= VT.getScalarSizeInBits()) {
Known.setAllZero();
break;
}
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
Known.Zero.ashrInPlace(ShAmt);
Known.One.ashrInPlace(ShAmt);
}
break;
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
break;
}
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// ANDNP = (~X & Y);
Known.One &= Known2.Zero;
Known.Zero |= Known2.One;
break;
}
case X86ISD::FOR: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known |= Known2;
break;
}
case X86ISD::PSADBW: {
assert(VT.getScalarType() == MVT::i64 &&
Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
"Unexpected PSADBW types");
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
Known.Zero.setBitsFrom(16);
break;
}
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
}
case X86ISD::BEXTR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
// If the length is 0, the result is 0.
if (Length == 0) {
Known.setAllZero();
break;
}
if ((Shift + Length) <= BitWidth) {
Known = DAG.computeKnownBits(Op0, Depth + 1);
Known = Known.extractBits(Length, Shift);
Known = Known.zextOrTrunc(BitWidth);
}
}
break;
}
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P:
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::MCVTP2SI:
case X86ISD::MCVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI:
case X86ISD::MCVTTP2SI:
case X86ISD::MCVTTP2UI:
case X86ISD::MCVTSI2P:
case X86ISD::MCVTUI2P:
case X86ISD::VFPROUND:
case X86ISD::VMFPROUND:
case X86ISD::CVTPS2PH:
case X86ISD::MCVTPS2PH: {
// Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumElts > NumSrcElts &&
DemandedElts.countTrailingZeros() >= NumSrcElts)
Known.setAllZero();
}
break;
}
case X86ISD::STRICT_CVTTP2SI:
case X86ISD::STRICT_CVTTP2UI:
case X86ISD::STRICT_CVTSI2P:
case X86ISD::STRICT_CVTUI2P:
case X86ISD::STRICT_VFPROUND:
case X86ISD::STRICT_CVTPS2PH: {
// Strict Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(1).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumElts > NumSrcElts &&
DemandedElts.countTrailingZeros() >= NumSrcElts)
Known.setAllZero();
}
break;
}
case X86ISD::MOVQ2DQ: {
// Move from MMX to XMM. Upper half of XMM should be 0.
if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
Known.setAllZero();
break;
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
Known.resetAll();
break;
} else if (M == SM_SentinelZero) {
Known.One.clearAllBits();
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
Known.resetAll();
break;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
// Known bits are the values that are shared by every demanded element.
for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
if (!DemandedOps[i])
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
}
}
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
case X86ISD::VTRUNC: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
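// A source with Tmp sign bits keeps Tmp - (NumSrcBits - VTBits) of them
// after truncation; e.g. 20 sign bits in an i32 leave 4 when truncating
// to i16.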
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
}
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
DemandedRHS);
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
if (!!DemandedLHS)
Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS)
Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
return 1;
}
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
}
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
case X86ISD::VPCOM:
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
case X86ISD::ANDNP: {
unsigned Tmp0 =
DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 =
DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp0, Tmp1);
}
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
return std::min(Tmp0, Tmp1);
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opcode)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
return 1;
} else if (M == SM_SentinelZero) {
// Zero = all sign bits.
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
return 1;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
unsigned Tmp0 = VTBits;
for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
if (!DemandedOps[i])
continue;
unsigned Tmp1 =
DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
Tmp0 = std::min(Tmp0, Tmp1);
}
return Tmp0;
}
}
}
// Fallback case.
return 1;
}
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
return N->getOperand(0);
return N;
}
// Helper to look for a normal load that can be narrowed into a vzload with the
// specified VT and memory VT. Returns SDValue() on failure.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
SelectionDAG &DAG) {
// Can't if the load is volatile or atomic.
if (!LN->isSimple())
return SDValue();
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
LN->getPointerInfo(), LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget, unsigned &Shuffle,
MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
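// For example, a v16i8 mask <0,Z,1,Z,...,7,Z> (Z = undef/zero) is matched
// here with Scale == 2 as a ZERO_EXTEND_VECTOR_INREG from v16i8 to v8i16.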
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool MatchAny = true;
bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
MatchAny = MatchZero = false;
break;
}
MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (MatchAny || MatchZero) {
assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
if (SrcVT.getVectorNumElements() != NumDstElts)
Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
return true;
}
}
}
// Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bits (MOVSS).
if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Check if we have SSE3, which lets us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
}
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
}
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
bool ContainsZeros = isAnyZero(Mask);
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
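// Each 64-bit element contributes one immediate bit choosing the low or
// high double within its 128-bit lane; e.g. the v4f64 mask <0,1,3,2>
// encodes as 0b0110.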
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}
// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
narrowShuffleMaskElts(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
Shuffle = X86ISD::PSHUFLW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
// PSHUFHW: permute upper 4 elements only.
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
// Offset the HiMask so that we can create the shuffle immediate.
int OffsetHiMask[4];
for (int i = 0; i != 4; ++i)
OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
Shuffle = X86ISD::PSHUFHW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
}
}
// Attempt to match against byte/bit shifts.
if (AllowIntDomain &&
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
32 <= ShuffleVT.getScalarSizeInBits())) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
// Attempt to match against bit rotates.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
Subtarget.hasAVX512())) {
int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
Subtarget, Mask);
if (0 < RotateAmt) {
Shuffle = X86ISD::VROTLI;
PermuteImm = (unsigned)RotateAmt;
return true;
}
}
return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, SDValue &V2, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
// Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
Subtarget)) {
DstVT = MaskVT;
return true;
}
}
// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}
return false;
}
static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against VALIGND/VALIGNQ rotate.
if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
(MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (!isAnyZero(Mask)) {
int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
if (0 < Rotation) {
Shuffle = X86ISD::VALIGN;
if (EltSizeInBits == 64)
ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
else
ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
PermuteImm = Rotation;
return true;
}
}
}
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
PermuteImm = ByteRotation;
return true;
}
}
// Attempt to combine to X86ISD::BLENDI.
if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
(Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
RepeatedMask)) {
assert(RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!");
PermuteImm = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
PermuteImm |= 1 << i;
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
} else {
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
}
}
// Attempt to combine to INSERTPS, but only if it has elements that need to
// be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector() && isAnyZero(Mask) &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
// Attempt to combine to SHUFPD.
if (AllowFloatDomain && EltSizeInBits == 64 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
bool ForceV1Zero = false, ForceV2Zero = false;
if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
PermuteImm, Mask, Zeroable)) {
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
if (AllowFloatDomain && EltSizeInBits == 32 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask, to determine if it's just
// referencing one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
if (isUndefInRange(RepeatedMask, Offset, 2)) {
return DAG.getUNDEF(MaskVT);
} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
S0 = (SM_SentinelUndef == M0 ? -1 : 0);
S1 = (SM_SentinelUndef == M1 ? -1 : 1);
return getZeroVector(MaskVT, Subtarget, DAG, DL);
} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V1;
} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V2;
}
return SDValue();
};
int ShufMask[4] = {-1, -1, -1, -1};
SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
if (Lo && Hi) {
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
}
}
// Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector() &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
return false;
}
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here; we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return DAG.getBitcast(RootVT, V1);
}
bool OptForSize = DAG.shouldOptForSize();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
bool IsMaskedShuffle = false;
if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
IsMaskedShuffle = true;
}
}
// If we are shuffling a broadcast (and not introducing zeros) then we can
// just use the broadcast directly. This works for smaller broadcast
// elements as well, since they already repeat across each mask element.
if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
return DAG.getBitcast(RootVT, V1);
}
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
if (UnaryShuffle &&
(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
SDValue Src = Inputs[0];
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
Src.getValueType(),
Src.getOperand(1)));
}
}
}
// Handle 128/256-bit lane shuffles of 512-bit vectors.
if (RootVT.is512BitVector() &&
(NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
// If the upper subvectors are zeroable, then an extract+insert is more
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
Res = DAG.getBitcast(ShuffleVT, V1);
unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
bool UseZero = isAnyZero(BaseMask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
return DAG.getBitcast(RootVT, Res);
}
// Narrow shuffle mask to v4x128.
SmallVector<int, 4> Mask;
assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
// Try to lower to vshuf64x2/vshuf32x4.
auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
unsigned PermMask = 0;
// Ensure elements came from the same Op.
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
for (int i = 0; i < 4; ++i) {
assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
if (Mask[i] < 0)
continue;
SDValue Op = Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit
// selection bits defined by a vshuf64x2 instruction's immediate control
// byte.
PermMask |= (Mask[i] % 4) << (i * 2);
}
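// For example, the lane mask <0,1,6,7> keeps the low half of V1 and the
// high half of V2, giving PermMask = 0xE4.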
return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
DAG.getBitcast(ShuffleVT, Ops[0]),
DAG.getBitcast(ShuffleVT, Ops[1]),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
};
// FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
// doesn't work because our mask is for 128 bits and we don't have an MVT
// to match that.
bool PreferPERMQ =
UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
isUndefOrInRange(Mask[3], 2, 4) &&
(Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
if (!isAnyZero(Mask) && !PreferPERMQ) {
if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
return DAG.getBitcast(RootVT, V);
}
}
// Handle 128-bit lane shuffles of 256-bit vectors.
if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
if (isUndefOrZero(BaseMask[1])) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
Res = DAG.getBitcast(ShuffleVT, V1);
Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
DL, 256);
return DAG.getBitcast(RootVT, Res);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
// Prefer blends for sequential shuffles unless we are optimizing for size.
if (UnaryShuffle &&
!(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
(OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
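// Note: the VPERM2X128 immediate holds one lane selector per nibble; setting
// bit 3 of a nibble (0x8) zeroes that result lane instead of selecting a
// source lane.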
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
return SDValue(); // Nothing to do!
// TODO - handle AVX512VL cases with X86ISD::SHUF128.
if (!UnaryShuffle && !IsMaskedShuffle) {
assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
"Unexpected shuffle sentinel value");
// Prefer blends to X86ISD::VPERM2X128.
if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
(BaseMask[0] == 2 && BaseMask[1] == 1))) {
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] & 3) << 0);
PermMask |= ((BaseMask[1] & 3) << 4);
Res = DAG.getNode(
X86ISD::VPERM2X128, DL, ShuffleVT,
DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
}
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
} else {
Mask.assign(BaseMask.begin(), BaseMask.end());
}
// For masked shuffles, we're trying to match the root width for better
// writemask folding, attempt to scale the mask.
// TODO - variable shuffles might need this to be widened again.
if (IsMaskedShuffle && NumRootElts > Mask.size()) {
assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
int MaskScale = NumRootElts / Mask.size();
SmallVector<int, 64> ScaledMask;
narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
}
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth >= 3);
bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt KnownUndef, KnownZero;
resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
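// (AVX1's VBROADCASTSS/VBROADCASTSD only have memory-operand forms;
// register-source and integer broadcasts require AVX2.)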
if ((Subtarget.hasAVX2() ||
(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
(!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
}
}
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Attempt to combine to INSERTPS, but only if the inserted element has come
// from a scalar.
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
!isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
SDValue SrcV1 = V1, SrcV2 = V2;
if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
DAG.getBitcast(MVT::v4f32, SrcV1),
DAG.getBitcast(MVT::v4f32, SrcV2),
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, NewV1, NewV2, DL, DAG,
Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Match shuffle against TRUNCATE patterns.
if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
// Match against a VTRUNC instruction, accounting for src/dst sizes.
if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
Subtarget)) {
bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
ShuffleSrcVT.getVectorNumElements();
unsigned Opc =
IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
if (Depth == 0 && Root.getOpcode() == Opc)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(ShuffleSrcVT, V1);
Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
if (ShuffleVT.getSizeInBits() < RootSizeInBits)
Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
return DAG.getBitcast(RootVT, Res);
}
// Do we need a more general binary truncation pattern?
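// e.g. a stride-2 mask {0,2,4,...} spanning both inputs picks the even
// elements of concat(V1,V2), i.e. truncate(concat(V1,V2)).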
if (RootSizeInBits < 512 &&
((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
(RootVT.is128BitVector() && Subtarget.hasVLX())) &&
(MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
V1 = DAG.getBitcast(ShuffleSrcVT, V1);
V2 = DAG.getBitcast(ShuffleSrcVT, V2);
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
return DAG.getBitcast(RootVT, Res);
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
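// Any lane of the zero vector yields zero; NumMaskElts + i simply points
// element i at the corresponding lane of the second (zero) source.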
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
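// e.g. Mask = {0,Z,2,Z} (Z = zero) keeps elements 0 and 2 and zeroes the
// rest, which is just an AND with the constant {-1,0,-1,0}.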
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
continue;
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, lower to VPERMILPS with a variable mask.
// TODO: Combine other mask types at higher depths.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
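// For 64-bit elements the per-lane selector sits in bits[2:1], hence the
// extra left shift when building VPerm2Idx below.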
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
VPerm2Idx.push_back(-1);
continue;
}
if (M == SM_SentinelZero) {
M2ZImm = 2;
VPerm2Idx.push_back(8);
continue;
}
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
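// Build the byte-level control mask; a PSHUFB mask byte with its high bit
// set (0x80) zeroes the corresponding destination byte.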
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
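// A mask byte of 0x80 selects permute operation 4 (ZERO) for that
// destination byte.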
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
V2 = DAG.getBitcast(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3.
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
// Failed to find any combines.
return SDValue();
}
// Combine an arbitrary chain of shuffles + extract_subvectors into a single
// instruction if possible.
//
// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
// type size to attempt to combine:
// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
// -->
// extract_subvector(shuffle(x,y,m2),0)
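// Each input's mask indices are rebased by its subvector extraction offset so
// that the widened mask indexes elements of the full-width source vectors.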
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
if (NumInputs == 0)
return SDValue();
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
WideSizeInBits = std::max(WideSizeInBits,
(unsigned)Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
Offset *= NumMaskElts;
}
// Bail if we're always extracting from the lowest subvectors;
// combineX86ShuffleChain should match this for the current width.
if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
return SDValue();
EVT RootVT = Root.getValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned Scale = WideSizeInBits / RootSizeInBits;
assert((WideSizeInBits % RootSizeInBits) == 0 &&
"Unexpected subvector extraction");
// If the src vector types aren't the same, see if we can extend
// them to match each other.
// TODO: Support different scalar types?
EVT WideSVT = WideInputs[0].getValueType().getScalarType();
if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
Op.getValueType().getScalarType() != WideSVT;
}))
return SDValue();
for (SDValue &NewInput : WideInputs) {
assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch");
if (WideSizeInBits > NewInput.getValueSizeInBits())
NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
SDLoc(NewInput), WideSizeInBits);
assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
"Unexpected subvector extraction");
}
// Create new mask for larger type.
for (unsigned i = 1; i != NumInputs; ++i)
Offsets[i] += i * Scale * NumMaskElts;
SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
for (int &M : WideMask) {
if (M < 0)
continue;
M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
}
WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
assert(!WideInputs.empty() && "Shuffle with no inputs detected");
if (WideInputs.size() > 2)
return SDValue();
// Increase depth for every upper subvector we've peeked through.
Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
SDValue WideRoot = WideInputs[0];
if (SDValue WideShuffle = combineX86ShuffleChain(
WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
}
return SDValue();
}
// Attempt to constant fold all of the constant source ops.
// Returns the folded constant node if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<APInt, 16> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return SDValue();
}
// Only fold if at least one of the constants is only used once or
// the combined shuffle has included a variable mask shuffle, this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return SDValue();
// Shuffle the constant bits according to the mask.
SDLoc DL(Root);
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ZeroElts.setBit(i);
continue;
}
ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Attempt to create a zero vector.
if ((UndefElts | ZeroElts).isAllOnesValue())
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
}
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
if (Depth >= MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
assert(VT.getSizeInBits() == RootSizeInBits &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
APInt OpUndef, OpZero;
APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
OpZero, DAG, Depth, false))
return SDValue();
// Shuffle inputs must be the same size as the result; bail on any larger
// inputs and widen any smaller inputs.
if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
return Op.getValueSizeInBits() > RootSizeInBits;
}))
return SDValue();
for (SDValue &Op : OpInputs)
if (Op.getValueSizeInBits() < RootSizeInBits)
Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
SDLoc(Op), RootSizeInBits);
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
// We don't need to merge masks if the root is empty.
bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
if (EmptyRoot) {
// Only resolve zeros if it will remove an input, otherwise we might end
// up in an infinite loop.
bool ResolveKnownZeros = true;
if (!OpZero.isNullValue()) {
APInt UsedInputs = APInt::getNullValue(OpInputs.size());
for (int i = 0, e = OpMask.size(); i != e; ++i) {
int M = OpMask[i];
if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
continue;
UsedInputs.setBit(M / OpMask.size());
if (UsedInputs.isAllOnesValue()) {
ResolveKnownZeros = false;
break;
}
}
}
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
ResolveKnownZeros);
Mask = OpMask;
Ops.append(OpInputs.begin(), OpInputs.end());
} else {
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
// Add the inputs to the Ops list, avoiding duplicates.
Ops.append(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
}
// Add to the end of the Ops list.
Ops.push_back(Input);
return Ops.size() - 1;
};
SmallVector<int, 2> OpInputIdx;
for (SDValue OpInput : OpInputs)
OpInputIdx.push_back(
AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
assert(isPowerOf2_32(RootMask.size()) &&
"Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio =
std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
Mask.resize(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by
// the root mask to get us all the way to the root value arrangement. The
// reason for this order is that we are recursing up the operation chain.
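// For example, assuming a single input resolved at index 0: with
// RootMask = {1,0} and OpMask = {2,3,0,1} we get MaskWidth = 4, RootRatio = 2,
// OpRatio = 1, and the merged Mask becomes {0,1,2,3} (swapping the halves
// twice composes to the identity).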
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
continue;
}
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef; it doesn't matter which ones we
// are using.
Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) +
(RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
}
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements possibly reducing
// future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be combined if it either has a
// single use (i.e. current Op) or all its users have already been combined;
// if not, we can still combine but should prevent generation of variable
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
SmallVector<int, 64> ResolvedMask = Mask;
if (EmptyRoot)
resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(Ops[0], Ops[1]);
}
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget);
}
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
HasVariableMask, AllowVariableMask,
DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
bool IsUnary;
bool HaveMask =
getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
// If we have more than 128 bits, only the low 128 bits of the shuffle mask
// matter. Check that the upper masks are repeats and remove them.
if (VT.getSizeInBits() > 128) {
int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
for (int j = 0; j < LaneElts; ++j)
assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!");
#endif
Mask.resize(LaneElts);
}
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::PSHUFLW:
Mask.resize(4);
return Mask;
case X86ISD::PSHUFHW:
Mask.erase(Mask.begin(), Mask.begin() + 4);
for (int &M : Mask)
M -= 4;
return Mask;
default:
llvm_unreachable("No valid shuffle instruction found!");
}
}
/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFD:
// Found another dword shuffle.
break;
case X86ISD::PSHUFLW:
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::PSHUFHW:
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
if (V.getOperand(0) != V.getOperand(1) ||
!V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
Chain.push_back(V);
V = V.getOperand(0);
do {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing to combine.
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOp)
break;
Chain.push_back(V);
LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
}
break;
} while (V.hasOneUse());
break;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
break;
}
}
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
}
// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SelectionDAG &DAG) {
// TODO: Add vXf64 support.
if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
return SDValue();
// SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
return SDValue();
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
MayFoldLoad(peekThroughOneUseBitcasts(N1)))
return SDValue();
Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
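// Commuting the inner SHUFP swaps which half of its (per 128-bit lane) result
// comes from each operand, so every outer 2-bit element selector reading the
// commuted value needs bit 1 flipped; hence the 0xAA/0x0A/0xA0 immediate
// adjustments below.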
switch (N.getOpcode()) {
case X86ISD::VPERMILPI:
if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
unsigned Imm = N.getConstantOperandVal(1);
return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
}
break;
case X86ISD::SHUFP: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned Imm = N.getConstantOperandVal(2);
if (N0 == N1) {
if (SDValue NewSHUFP = commuteSHUFP(N, N0))
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
} else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
} else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
}
break;
}
}
return SDValue();
}
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
bool IsUnary;
SmallVector<int, 64> TargetMask;
SmallVector<SDValue, 2> TargetOps;
if (isTargetShuffle(Opcode))
getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
// represents the LHS/RHS inputs for the lower/upper halves.
SmallVector<int, 16> TargetMask128;
if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
SmallVector<int, 16> WidenedMask128 = TargetMask128;
while (WidenedMask128.size() > 2) {
SmallVector<int, 16> WidenedMask;
if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
break;
WidenedMask128 = std::move(WidenedMask);
}
if (WidenedMask128.size() == 2) {
assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
SDValue BC0 = peekThroughBitcasts(TargetOps.front());
SDValue BC1 = peekThroughBitcasts(TargetOps.back());
EVT VT0 = BC0.getValueType();
EVT VT1 = BC1.getValueType();
unsigned Opcode0 = BC0.getOpcode();
unsigned Opcode1 = BC1.getOpcode();
bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
if (Opcode0 == Opcode1 && VT0 == VT1 &&
(isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
bool SingleOp = (TargetOps.size() == 1);
if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
Lo = Lo.getOperand(WidenedMask128[0] & 1);
Hi = Hi.getOperand(WidenedMask128[1] & 1);
if (SingleOp) {
MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
SDValue Undef = DAG.getUNDEF(SrcVT);
SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
}
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
return DAG.getBitcast(VT, Horiz);
}
}
}
}
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
// Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
// help expose the 'NOT' pattern further up the DAG.
// TODO: This might be beneficial for any binop with a 'splattable' operand.
switch (Opcode) {
case X86ISD::MOVDDUP:
case X86ISD::PSHUFD: {
SDValue Src = N.getOperand(0);
if (Src.hasOneUse() && Src.getValueType() == VT) {
if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
Not = DAG.getBitcast(VT, Not);
Not = Opcode == X86ISD::MOVDDUP
? DAG.getNode(Opcode, DL, VT, Not)
: DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
EVT IntVT = Not.getValueType().changeTypeToInteger();
SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
Not = DAG.getBitcast(IntVT, Not);
Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
return DAG.getBitcast(VT, Not);
}
}
break;
}
}
// Handle specific target shuffles.
switch (Opcode) {
case X86ISD::MOVDDUP: {
SDValue Src = N.getOperand(0);
// Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
if (VT == MVT::v2f64 && Src.hasOneUse() &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
DCI.CombineTo(N.getNode(), Movddup);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return N; // Return N so it doesn't get rechecked!
}
}
return SDValue();
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
// If broadcasting from another shuffle, attempt to simplify it.
// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
}
// Reduce broadcast source vector to lowest 128-bits.
if (SrcVT.getSizeInBits() > 128)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
extract128BitVector(Src, 0, DAG, DL));
// broadcast(scalar_to_vector(x)) -> broadcast(x).
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
// Share broadcast with the longest vector and extract low subvector (free).
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
VT.getSizeInBits());
}
// vbroadcast(scalarload X) -> vbroadcast_load X
// For float loads, extract other uses of the scalar from the broadcast.
if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceExtract = Src.hasOneUse();
DCI.CombineTo(N.getNode(), BcastLd);
if (NoReplaceExtract) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
} else {
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
DAG.getIntPtrConstant(0, DL));
DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
}
return N; // Return N so it doesn't get rechecked!
}
// Due to isTypeDesirableForOp, we won't always shrink a load truncated to
// i16. So shrink it ourselves if we can make a broadcast_load.
if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
assert(Subtarget.hasAVX2() && "Expected AVX2");
SDValue TruncIn = Src.getOperand(0);
// If this is a truncate of a non-extending load, we can just narrow it to
// use a broadcast_load.
if (ISD::isNormalLoad(TruncIn.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
// Unless it's volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
LN->getPointerInfo(), LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(Src.getNode());
return N; // Return N so it doesn't get rechecked!
}
}
// If this is a truncate of an i16 extload, we can directly replace it.
if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
ISD::isEXTLoad(Src.getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
if (LN->getMemoryVT().getSizeInBits() == 16) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(Src.getNode());
return N; // Return N so it doesn't get rechecked!
}
}
// If this is a truncate of a load that has been shifted right, we can
// offset the pointer and use a narrower load.
if (TruncIn.getOpcode() == ISD::SRL &&
TruncIn.getOperand(0).hasOneUse() &&
isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
// Make sure the shift amount and the load size are divisible by 16.
// Don't do this if the load is volatile or atomic.
if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
LN->isSimple()) {
unsigned Offset = ShiftAmt / 8;
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
SDValue Ops[] = { LN->getChain(), Ptr };
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
LN->getPointerInfo().getWithOffset(Offset),
LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(Src.getNode());
return N; // Return N so it doesn't get rechecked!
}
}
}
// vbroadcast(vzload X) -> vbroadcast_load X
if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return N; // Return N so it doesn't get rechecked!
}
}
// vbroadcast(vector load X) -> vbroadcast_load
if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
LN->getPointerInfo(), LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return N; // Return N so it doesn't get rechecked!
}
}
return SDValue();
}
case X86ISD::VZEXT_MOVL: {
SDValue N0 = N.getOperand(0);
// If this is a vzmovl of a full vector load, replace it with a vzload, unless
// the load is volatile.
if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
auto *LN = cast<LoadSDNode>(N0);
if (SDValue VZLoad =
narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
DCI.CombineTo(N.getNode(), VZLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return N;
}
}
// If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
// and can just use a VZEXT_LOAD.
// FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *LN = cast<MemSDNode>(N0);
if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), VZLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return N;
}
}
// Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
// (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
// if the upper bits of the i64 are zero.
if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
N0.getOperand(0).hasOneUse() &&
N0.getOperand(0).getValueType() == MVT::i64) {
SDValue In = N0.getOperand(0);
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(In, Mask)) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
return DAG.getBitcast(VT, Movl);
}
}
// Load a scalar integer constant directly to XMM instead of transferring an
// immediate value from a GPR.
// vzext_movl (scalar_to_vector C) --> load [C,0...]
if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
// Create a vector constant - scalar constant followed by zeros.
EVT ScalarVT = N0.getOperand(0).getValueType();
Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
unsigned NumElts = VT.getVectorNumElements();
Constant *Zero = ConstantInt::getNullValue(ScalarTy);
SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
// Load the vector constant from constant pool.
MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
MachinePointerInfo MPI =
MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
MachineMemOperand::MOLoad);
}
}
return SDValue();
}
case X86ISD::BLENDI: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
// TODO: Handle MVT::v16i16 repeated blend mask.
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
unsigned BlendMask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
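// Repeat each blend mask bit Scale times so it covers the equivalent
// narrower elements.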
BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
N1.getOperand(0),
DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
}
}
return SDValue();
}
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (N0.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
return DAG.getBitcast(VT, Res);
}
return SDValue();
}
case X86ISD::VPERM2X128: {
// If both 128-bit values were inserted into high halves of 256-bit values,
// the shuffle can be reduced to a concatenation of subvectors:
// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
// Note: We are only looking for the exact high/high shuffle mask because we
// expect to fold other similar patterns before creating this opcode.
SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
unsigned Imm = N.getConstantOperandVal(2);
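// An immediate of 0x31 selects the high 128-bit half of each source operand.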
if (!(Imm == 0x31 &&
Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
Ins0.getValueType() == Ins1.getValueType()))
return SDValue();
SDValue X = Ins0.getOperand(1);
SDValue Y = Ins1.getOperand(1);
unsigned C1 = Ins0.getConstantOperandVal(2);
unsigned C2 = Ins1.getConstantOperandVal(2);
MVT SrcVT = X.getSimpleValueType();
unsigned SrcElts = SrcVT.getVectorNumElements();
if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
C1 != SrcElts || C2 != SrcElts)
return SDValue();
return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
Ins1.getValueType(), X, Y));
}
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// Canonicalize scalar FPOps:
// MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
// If commutable, allow OP(N1[0], N0[0]).
unsigned Opcode1 = N1.getOpcode();
if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
Opcode1 == ISD::FDIV) {
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
if (N10 == N0 ||
(N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
if (N10 != N0)
std::swap(N10, N11);
MVT SVT = VT.getVectorElementType();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
return DAG.getNode(Opcode, DL, VT, N0, SclVec);
}
}
return SDValue();
}
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
unsigned InsertPSMask = N.getConstantOperandVal(2);
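// INSERTPS immediate: bits [7:6] source lane, bits [5:4] destination lane,
// bits [3:0] zero mask.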
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
APInt KnownUndef1, KnownZero1;
if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
KnownZero1)) {
if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
int M = TargetMask1[SrcIdx];
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
APInt KnownUndef0, KnownZero0;
if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
KnownZero0)) {
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (KnownUndef0[i] || KnownZero0[i]) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
}
// The input vector element must be inline.
int M = TargetMask0[i];
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
}
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// If we're inserting an element from a vbroadcast load, fold the
// load into the X86insertps instruction. We need to convert the scalar
// load to a vector and clear the source lane of the INSERTPS control.
if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getMemOperand());
SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
Load),
DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Insert;
}
}
return SDValue();
}
default:
return SDValue();
}
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
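// Build a dword shuffle mask that swaps the two dwords covered by this
// word shuffle.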
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
return DAG.getBitcast(VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
// FIXME: This doesn't handle the location of the PSHUFD generically, and
// only works when we have a PSHUFD followed by two half-shuffles.
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse() && V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
for (int i = 0; i < 4; ++i) {
WordMask[i + NOffset] = Mask[i] + NOffset;
WordMask[i + VOffset] = VMask[i] + VOffset;
}
// Map the word mask through the DWord mask.
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
}
}
}
break;
case X86ISD::PSHUFD:
if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
}
return SDValue();
}
/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
int ParitySrc[2] = {-1, -1};
unsigned Size = Mask.size();
for (unsigned i = 0; i != Size; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Make sure we are using the matching element from the input.
if ((M % Size) != i)
return false;
// Make sure we use the same input for all elements of the same parity.
int Src = M / Size;
if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
return false;
ParitySrc[i % 2] = Src;
}
// Make sure each input is used.
if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
return false;
Op0Even = ParitySrc[0] == 0;
return true;
}
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
/// nodes so they are easier to match generically. We also insert dummy vector
/// shuffle nodes for the operands which explicitly discard the lanes which are
/// unused by this operation, so that the rest of the combiner can see that they
/// are unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
!VT.getSimpleVT().isFloatingPoint())
return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
// Make sure we have an FADD and an FSUB.
if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
(V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS, RHS;
if (V1.getOpcode() == ISD::FSUB) {
LHS = V1->getOperand(0); RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
} else {
assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
LHS = V2->getOperand(0); RHS = V2->getOperand(1);
if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
return false;
}
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
// It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue combineShuffleToFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
MVT VT = N->getSimpleValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
return SDValue();
// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue FMAdd = Op0, FMSub = Op1;
if (FMSub.getOpcode() != X86ISD::FMSUB)
std::swap(FMAdd, FMSub);
if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
FMAdd.getOperand(2) != FMSub.getOperand(2))
return SDValue();
// Check for correct shuffle mask.
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return SDValue();
// FMAddSub takes zeroth operand from FMSub node.
SDLoc DL(N);
bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
FMAdd.getOperand(2));
}
/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
return V;
SDValue Opnd0, Opnd1;
bool IsSubAdd;
if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
EVT VT = N->getValueType(0);
// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (VT.getVectorElementType() != MVT::i32 &&
VT.getVectorElementType() != MVT::i64 &&
VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check that both sources are concats with undef.
if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
!N1.getOperand(1).isUndef())
return SDValue();
// Construct the new shuffle mask. Elements from the first source retain their
// index, but elements from the second source no longer need to skip an undef.
SmallVector<int, 8> Mask;
int NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (int Elt : SVOp->getMask())
Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
SDLoc DL(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
EVT VT = N->getValueType(0);
if (Opcode == X86ISD::VBROADCAST) {
SDValue SrcOp = N->getOperand(0);
if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
SrcOp.getValueType() == MVT::f64 &&
SrcOp.getOperand(0).getValueType() == VT &&
isNullConstant(SrcOp.getOperand(1)))
N = SrcOp.getNode();
}
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// The shuffle that we are eliminating may have allowed the horizontal op to
// have an undemanded (undefined) operand. Duplicate the other (defined)
// operand to ensure that the results are defined across all lanes without the
// shuffle.
auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
SDValue X;
if (HorizOp.getOperand(0).isUndef()) {
assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(1);
} else if (HorizOp.getOperand(1).isUndef()) {
assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(0);
} else {
return HorizOp;
}
return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
HorizOp.getValueType(), X, X);
};
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
// replicating low and high halves (and without changing the type/length of
// the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
return updateHOp(HOp, DAG);
}
return SDValue();
}
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
if (HOp.getValueSizeInBits() == 128 &&
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
isTargetShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return updateHOp(HOp, DAG);
return SDValue();
}
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
if (!Shuf->getValueType(0).isSimple())
return SDValue();
MVT VT = Shuf->getSimpleValueType(0);
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// See if we can ignore all of the high elements of the shuffle.
ArrayRef<int> Mask = Shuf->getMask();
if (!isUndefUpperHalf(Mask))
return SDValue();
// Check if the shuffle mask accesses only the low half of each input vector
// (half-index output is 0 or 2).
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(Mask.size() / 2);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
(HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
return SDValue();
// Create a half-width shuffle to replace the unnecessarily wide shuffle.
// The trick is knowing that all of the insert/extract are actually free
// subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
// of narrow inputs into a narrow output, and that is always cheaper than
// the wide shuffle that we started with.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
HalfIdx2, false, DAG, /*UseConcat*/true);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
if (SDValue V = narrowShuffle(Shuf, DAG))
return V;
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
}
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// Since the latter can be efficiently lowered with VPERMD/VPERMQ
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
return SDValue(N, 0);
}
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
N->getOperand(0).hasOneUse()) {
SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
SDValue In = V.getOperand(1);
MVT SubVT =
MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
In.getValueSizeInBits() / VT.getScalarSizeInBits());
In = DAG.getBitcast(SubVT, In);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
Movl, V.getOperand(2));
}
}
return SDValue();
}
// Simplify variable target shuffle masks based on the demanded elements.
// TODO: Handle DemandedBits in mask indices as well?
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
// If we're demanding all elements, don't bother trying to simplify the mask.
unsigned NumElts = DemandedElts.getBitWidth();
if (DemandedElts.isAllOnesValue())
return false;
SDValue Mask = Op.getOperand(MaskIndex);
if (!Mask.hasOneUse())
return false;
// Attempt to generically simplify the variable shuffle mask.
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
// Attempt to extract+simplify a (constant pool load) shuffle mask.
// TODO: Support other types from getTargetShuffleMaskIndices?
SDValue BC = peekThroughOneUseBitcasts(Mask);
EVT BCVT = BC.getValueType();
auto *Load = dyn_cast<LoadSDNode>(BC);
if (!Load)
return false;
const Constant *C = getTargetConstantFromNode(Load);
if (!C)
return false;
Type *CTy = C->getType();
if (!CTy->isVectorTy() ||
CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
return false;
// Handle scaling for i64 elements on 32-bit targets.
unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
return false;
unsigned Scale = NumCstElts / NumElts;
// Simplify mask if we have an undemanded element that is not undef.
bool Simplified = false;
SmallVector<Constant *, 32> ConstVecOps;
for (unsigned i = 0; i != NumCstElts; ++i) {
Constant *Elt = C->getAggregateElement(i);
if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
ConstVecOps.push_back(UndefValue::get(Elt->getType()));
Simplified = true;
continue;
}
ConstVecOps.push_back(Elt);
}
if (!Simplified)
return false;
// Generate new constant pool entry + legalize immediately for the load.
SDLoc DL(Op);
SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
SDValue NewMask = TLO.DAG.getLoad(
BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
Load->getAlign());
return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
}
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
// Handle special case opcodes.
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
break;
}
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
// We only need the bottom 64-bits of the (128-bit) shift amount.
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
// If the shift amount is only ever reused as an SSE shift amount, then we
// know that only the bottom 64 bits are ever used.
bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
unsigned UseOpc = Use->getOpcode();
return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
UseOpc == X86ISD::VSRA) &&
Use->getOperand(0) != Amt;
});
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
// TODO convert SrcUndef to KnownUndef.
break;
}
case X86ISD::KSHIFTL: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
if (Src.getOpcode() == X86ISD::KSHIFTR) {
if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTL;
int Diff = ShiftAmt - C1;
if (Diff < 0) {
Diff = -Diff;
NewOpc = X86ISD::KSHIFTR;
}
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
}
}
APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
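// KSHIFTL moves elements towards the high end; the vacated low elements
// are known zero.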
KnownUndef <<= ShiftAmt;
KnownZero <<= ShiftAmt;
KnownZero.setLowBits(ShiftAmt);
break;
}
case X86ISD::KSHIFTR: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted
// out) are never demanded.
if (Src.getOpcode() == X86ISD::KSHIFTL) {
if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTR;
int Diff = ShiftAmt - C1;
if (Diff < 0) {
Diff = -Diff;
NewOpc = X86ISD::KSHIFTL;
}
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
}
}
APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
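// KSHIFTR moves elements towards the low end; the vacated high elements
// are known zero.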
KnownUndef.lshrInPlace(ShiftAmt);
KnownZero.lshrInPlace(ShiftAmt);
KnownZero.setHighBits(ShiftAmt);
break;
}
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffle ops.
if (!DemandedElts.isAllOnesValue()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
return TLO.CombineTo(Op,
TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
}
}
break;
}
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
SelZero, TLO, Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
KnownZero = LHSZero & RHSZero;
KnownUndef = LHSUndef & RHSUndef;
break;
}
case X86ISD::VZEXT_MOVL: {
// If upper demanded elements are already zero then we have nothing to do.
SDValue Src = Op.getOperand(0);
APInt DemandedUpperElts = DemandedElts;
DemandedUpperElts.clearLowBits(1);
if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
return TLO.CombineTo(Op, Src);
break;
}
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
}
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through src to get at the demanded elt.
// TODO - we should do this for all target/faux shuffle ops.
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
Src, SrcElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
break;
}
case X86ISD::VPERMV:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
Depth))
return true;
break;
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
case X86ISD::VPERMILPV:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
Depth))
return true;
break;
case X86ISD::VPPERM:
case X86ISD::VPERMIL2:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
Depth))
return true;
break;
}
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
if ((VT.is256BitVector() || VT.is512BitVector()) &&
DemandedElts.lshr(NumElts / 2) == 0) {
unsigned SizeInBits = VT.getSizeInBits();
unsigned ExtSizeInBits = SizeInBits / 2;
// See if 512-bit ops only use the bottom 128-bits.
if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
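// Rebuild the subvector broadcast at the narrower width: either extract
// from a wider source or re-broadcast a narrower one.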
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
else if (Src.getValueSizeInBits() < ExtSizeInBits) {
MVT SrcSVT = Src.getSimpleValueType().getScalarType();
MVT SrcVT =
MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
}
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
TLO.DAG, DL, ExtSizeInBits));
}
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
// Shift by uniform.
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA:
// Shift by immediate.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
case X86ISD::VPERMI: {
// Simplify PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
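// With only the low half demanded, a mask of {2,3,*,*} is just an extract
// of the upper 128-bit half.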
if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
SDLoc DL(Op);
SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
return TLO.CombineTo(Op, Insert);
}
}
break;
}
// Zero upper elements.
case X86ISD::VZEXT_MOVL:
// Target unary shuffles by immediate:
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
case X86ISD::VPERMILPI:
// (Non-Lane Crossing) Target Shuffles.
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
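// Extract the low ExtSizeInBits of each vector operand, perform the op at
// the narrow width, then reinsert the result into an undef wide vector.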
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
SDValue SrcOp = Op.getOperand(i);
EVT SrcVT = SrcOp.getValueType();
assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
"Unsupported vector size");
Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
ExtSizeInBits)
: SrcOp);
}
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
}
}
// Get target/faux shuffle mask.
APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
OpZero, TLO.DAG, Depth, false))
return false;
// Shuffle inputs must be the same size as the result.
if (OpMask.size() != (unsigned)NumElts ||
llvm::any_of(OpInputs, [VT](SDValue V) {
return VT.getSizeInBits() != V.getValueSizeInBits() ||
!V.getValueType().isVector();
}))
return false;
KnownZero = OpZero;
KnownUndef = OpUndef;
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
for (int i = 0; i != NumElts; ++i)
if (!DemandedElts[i])
OpMask[i] = SM_SentinelUndef;
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
KnownZero.setAllBits();
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
}
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
// TODO: Support inputs of different types.
if (OpInputs[Src].getValueType() != VT)
continue;
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
SrcElts.setBit(M);
}
// TODO - Propagate input undef/zero elts.
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
// TODO: Handle other depths, but first we need to handle the fact that
// it might combine to the same shuffle.
if (!DemandedElts.isAllOnesValue() && Depth == 0) {
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
{Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
}
return false;
}
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
case X86ISD::VTRUNC: {
KnownBits KnownOp;
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// Simplify the input, using demanded bit information.
APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
return true;
break;
}
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded low bits.
SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
if (DemandedLHS || DemandedRHS) {
DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
}
break;
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
if (Op0.getOpcode() == X86ISD::VSRLI &&
OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
unsigned Shift2Amt = Op0.getConstantOperandVal(1);
if (Shift2Amt < BitWidth) {
int Diff = ShAmt - Shift2Amt;
if (Diff == 0)
return TLO.CombineTo(Op, Op0.getOperand(0));
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
// If we are only demanding sign bits then we can use the shift source directly.
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
unsigned UpperDemandedBits =
BitWidth - OriginalDemandedBits.countTrailingZeros();
if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
return TLO.CombineTo(Op, Op0);
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
break;
}
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits << ShAmt;
if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
OriginalDemandedElts, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits << ShAmt;
// If we just want the sign bit then we don't need to shift it.
if (OriginalDemandedBits.isSignMask())
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
if (Op0.getOpcode() == X86ISD::VSHLI &&
Op.getOperand(1) == Op0.getOperand(1)) {
SDValue Op00 = Op0.getOperand(0);
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
if (ShAmt < NumSignBits)
return TLO.CombineTo(Op, Op00);
}
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
DemandedMask.setSignBit();
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
OriginalDemandedBits.countLeadingZeros() >= ShAmt)
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
MVT VecVT = Vec.getSimpleValueType();
unsigned NumVecElts = VecVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
unsigned Idx = CIdx->getZExtValue();
unsigned VecBitWidth = VecVT.getScalarSizeInBits();
// If we demand no bits from the vector then we must have demanded
// bits from the implicit zext - simplify to zero.
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
if (DemandedVecBits == 0)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
APInt KnownUndef, KnownZero;
APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
if (SDValue V = SimplifyMultipleUseDemandedBits(
Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
Known = KnownVec.zext(BitWidth);
return false;
}
break;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
if (!OriginalDemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
KnownBits KnownVec;
APInt DemandedVecElts(OriginalDemandedElts);
DemandedVecElts.clearBit(Idx);
if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
Known.One = KnownVec.One & KnownScl.One;
Known.Zero = KnownVec.Zero & KnownScl.Zero;
return false;
}
break;
}
case X86ISD::PACKSS:
// PACKSS saturates to MIN/MAX integer values. So if we just want the
// sign bit then we can just ask for the source operands' sign bits.
// TODO - add known bits handling.
if (OriginalDemandedBits.isSignMask()) {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
KnownBits KnownLHS, KnownRHS;
APInt SignMask = APInt::getSignMask(BitWidth * 2);
if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
KnownLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
// Attempt to avoid multi-use ops if we don't need anything from them.
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
if (DemandedOp0 || DemandedOp1) {
SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
}
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// If we don't need the sign bits at all just return zero.
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
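// Zero elements produce zero result bits, and everything above NumElts is
// zero extended.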
Known.Zero = KnownZero.zextOrSelf(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
// Attempt to avoid a multi-use op if we don't need anything from it.
if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
case X86ISD::BEXTR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Only the bottom 16 bits of the control value are required.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// NOTE: SimplifyDemandedBits won't do this for constants.
const APInt &Val1 = Cst1->getAPIntValue();
APInt MaskedVal1 = Val1 & 0xFFFF;
if (MaskedVal1 != Val1) {
SDLoc DL(Op);
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
TLO.DAG.getConstant(MaskedVal1, DL, VT)));
}
}
KnownBits Known1;
APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
return true;
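// The BEXTR control encodes the start index in bits [7:0] and the length
// in bits [15:8].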
// If the length is 0, replace with 0.
KnownBits LengthBits = Known1.extractBits(8, 8);
if (LengthBits.isZero())
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
break;
}
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
SelectionDAG &DAG, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
switch (Opc) {
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
// If we don't demand the inserted element, return the base vector.
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
break;
}
case X86ISD::VSHLI: {
// If we are only demanding sign bits then we can use the shift source
// directly.
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = Op.getConstantOperandVal(1);
unsigned BitWidth = DemandedBits.getBitWidth();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
return Op0;
break;
}
case X86ISD::VSRAI:
// iff we only need the sign bit then we can use the source directly.
// TODO: generalize where we only demand extended signbits.
if (DemandedBits.isSignMask())
return Op.getOperand(0);
break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
if (DemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return Op.getOperand(1);
break;
}
APInt ShuffleUndef, ShuffleZero;
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
// If all the demanded elts are from one operand and are inline,
// then we can use the operand directly.
int NumOps = ShuffleOps.size();
if (ShuffleMask.size() == (unsigned)NumElts &&
llvm::all_of(ShuffleOps, [VT](SDValue V) {
return VT.getSizeInBits() == V.getValueSizeInBits();
})) {
if (DemandedElts.isSubsetOf(ShuffleUndef))
return DAG.getUNDEF(VT);
if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
// Bitmask that indicates which ops have only been accessed 'inline'.
APInt IdentityOp = APInt::getAllOnesValue(NumOps);
for (int i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
continue;
int OpIdx = M / NumElts;
int EltIdx = M % NumElts;
if (M < 0 || EltIdx != i) {
IdentityOp.clearAllBits();
break;
}
IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
if (IdentityOp == 0)
break;
}
assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
"Multiple identity shuffles detected");
if (IdentityOp != 0)
return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
}
}
return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
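// For example (an illustrative sketch; the values are hypothetical):
//   Src = (v4i1 and (setcc (v4i64 a, b)), (setcc (v4i64 c, d)))
// checkBitcastSrcVectorSize(Src, 256) returns true because every SETCC leaf
// compares 256-bit (v4i64) operands, whereas a mix of v4i64 and v4i32
// compares would return false.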
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
switch (Src.getOpcode()) {
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
checkBitcastSrcVectorSize(Src.getOperand(1), Size);
}
return false;
}
// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
static unsigned getAltBitOpcode(unsigned Opcode) {
switch(Opcode) {
case ISD::AND: return X86ISD::FAND;
case ISD::OR: return X86ISD::FOR;
case ISD::XOR: return X86ISD::FXOR;
case X86ISD::ANDNP: return X86ISD::FANDN;
}
llvm_unreachable("Unknown bitwise opcode");
}
// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
const SDLoc &DL) {
EVT SrcVT = Src.getValueType();
if (SrcVT != MVT::v4i1)
return SDValue();
switch (Src.getOpcode()) {
case ISD::SETCC:
if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
SDValue Op0 = Src.getOperand(0);
if (ISD::isNormalLoad(Op0.getNode()))
return DAG.getBitcast(MVT::v4f32, Op0);
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getOperand(0).getValueType() == MVT::v4f32)
return Op0.getOperand(0);
}
break;
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
if (Op0 && Op1)
return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
Op1);
break;
}
}
return SDValue();
}
// Helper to push sign extension of vXi1 SETCC result through bitops.
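// For example (an illustrative sketch with hypothetical operands), with
// SExtVT = v4i64:
//   (v4i1 or (setcc A, B, CC), (setcc C, D, CC))
// becomes
//   (v4i64 or (sign_extend (setcc A, B, CC)), (sign_extend (setcc C, D, CC)))
// i.e. each compare is sign-extended before the bitwise op instead of after.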
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
switch (Src.getOpcode()) {
case ISD::SETCC:
return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return DAG.getNode(
Src.getOpcode(), DL, SExtVT,
signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
}
llvm_unreachable("Unexpected node type for vXi1 sign extension");
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
// legalization destroys the v4i32 type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
DAG.getBitcast(MVT::v4f32, V));
return DAG.getZExtOrTrunc(V, DL, VT);
}
}
// If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
Src.getOperand(0).getValueType() == MVT::v64i8);
// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
// directly with vpmovmskb/vmovmskps/vmovmskpd.
if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
EVT CmpVT = Src.getOperand(0).getValueType();
EVT EltVT = CmpVT.getVectorElementType();
if (CmpVT.getSizeInBits() <= 256 &&
(EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
PreferMovMsk = true;
}
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
bool PropagateSExt = false;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
break;
case MVT::v4i1:
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
}
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
checkBitcastSrcVectorSize(Src, 512))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
}
break;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
SExtVT = MVT::v32i8;
break;
case MVT::v64i1:
// If we have AVX512F but not AVX512BW, and the input is a truncate from
// v64i8 (checked earlier), split the input and make two pmovmskbs.
if (Subtarget.hasAVX512()) {
if (Subtarget.hasBWI())
return SDValue();
SExtVT = MVT::v64i8;
break;
}
// Split if this is a <64 x i8> comparison result.
if (checkBitcastSrcVectorSize(Src, 512)) {
SExtVT = MVT::v64i8;
break;
}
return SDValue();
}
SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
EVT IntVT =
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
V = DAG.getZExtOrTrunc(V, DL, IntVT);
return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
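// For example (illustrative): (v4i1 build_vector 1, 0, undef, 1) becomes the
// i4 constant 0b1001 - bit Idx is set only when element Idx is a constant
// with its low bit set, and undef elements are treated as zero.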
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
EVT SrcVT = Op.getValueType();
assert(SrcVT.getVectorElementType() == MVT::i1 &&
"Expected a vXi1 vector");
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector");
APInt Imm(SrcVT.getVectorNumElements(), 0);
for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
SDValue In = Op.getOperand(Idx);
if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
Imm.setBit(Idx);
}
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
return DAG.getConstant(Imm, SDLoc(Op), IntVT);
}
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// Only do this if we have k-registers.
if (!Subtarget.hasAVX512())
return SDValue();
EVT DstVT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT SrcVT = Op.getValueType();
if (!Op.hasOneUse())
return SDValue();
// Look for logic ops.
if (Op.getOpcode() != ISD::AND &&
Op.getOpcode() != ISD::OR &&
Op.getOpcode() != ISD::XOR)
return SDValue();
// Make sure we have a bitcast between mask registers and a scalar type.
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
DstVT.isScalarInteger()) &&
!(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
SrcVT.isScalarInteger()))
return SDValue();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
LHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
DAG.getBitcast(DstVT, RHS));
if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
RHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
// If the RHS is a vXi1 build vector, this is a good reason to flip too.
// Most of these have to move a constant from the scalar domain anyway.
if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
RHS = combinevXi1ConstantToInteger(RHS, DAG);
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS);
}
return SDValue();
}
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(BV);
unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
auto CreateMMXElement = [&](SDValue V) {
if (V.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
if (V.getValueType().isFloatingPoint()) {
if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
V = DAG.getBitcast(MVT::v2i64, V);
return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
}
V = DAG.getBitcast(MVT::i32, V);
} else {
V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
}
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
};
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
Splat = CreateMMXElement(Splat);
if (Subtarget.hasSSE1()) {
// Unpack v8i8 to splat i8 elements to lowest 16-bits.
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
Splat);
// Use PSHUFW to repeat 16-bit elements.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
while (Ops.size() > 1) {
unsigned NumOps = Ops.size();
unsigned IntrinOp =
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
Ops.resize(NumOps / 2);
}
return Ops[0];
}
// Recursive function that attempts to find if a bool vector node was originally
// a vector/float/double that got truncated/extended/bitcast to/from a scalar
// integer. If so, replace the scalar ops with bool vector equivalents back down
// the chain.
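// For example (an illustrative sketch assuming v16i1/v32i1 are legal, e.g.
// with AVX512):
//   (v16i1 bitcast (i16 trunc (i32 bitcast (v32i1 X))))
// can be rebuilt in the mask domain as
//   (v16i1 extract_subvector (v32i1 X), 0)
// avoiding the round trip through a scalar integer register.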
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned Opc = V.getOpcode();
switch (Opc) {
case ISD::BITCAST: {
// Bitcast from a vector/float/double, we can cheaply bitcast to VT.
SDValue Src = V.getOperand(0);
EVT SrcVT = Src.getValueType();
if (SrcVT.isVector() || SrcVT.isFloatingPoint())
return DAG.getBitcast(VT, Src);
break;
}
case ISD::TRUNCATE: {
// If we find a suitable source, a truncated scalar becomes a subvector.
SDValue Src = V.getOperand(0);
EVT NewSrcVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
if (SDValue N0 =
combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
DAG.getIntPtrConstant(0, DL));
break;
}
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: {
// If we find a suitable source, an extended scalar becomes a subvector.
SDValue Src = V.getOperand(0);
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Src.getScalarValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
if (SDValue N0 =
combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
: DAG.getConstant(0, DL, VT),
N0, DAG.getIntPtrConstant(0, DL));
break;
}
case ISD::OR: {
// If we find suitable sources, we can just move an OR to the vector domain.
SDValue Src0 = V.getOperand(0);
SDValue Src1 = V.getOperand(1);
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
return DAG.getNode(Opc, DL, VT, N0, N1);
break;
}
case ISD::SHL: {
// If we find a suitable source, a SHL becomes a KSHIFTL.
SDValue Src0 = V.getOperand(0);
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
return DAG.getNode(
X86ISD::KSHIFTL, DL, VT, N0,
DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
break;
}
}
return SDValue();
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
SDLoc dl(N);
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
DAG.getIntPtrConstant(0, dl));
}
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
// Use zeros for the widening if we already have some zeroes. This can
// allow SimplifyDemandedBits to remove scalar ANDs that may be
// downstream of this.
// FIXME: It might make sense to detect a concat_vectors with a mix of
// zeroes and undef and turn it into insert_subvector for i1 vectors as
// a separate combine. What we can't do is canonicalize the operands of
// such a concat or we'll get into a loop with SimplifyDemandedBits.
if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
SrcVT = LastOp.getValueType();
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
}
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
} else {
// If we're bitcasting from iX to vXi1, see if the integer originally
// began as a vXi1 and whether we can remove the bitcast entirely.
if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
SrcVT.isScalarInteger() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
if (SDValue V =
combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
return V;
}
}
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
// replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
// due to insert_subvector legalization on KNL. By promoting the copy to i16
// we can help with known bits propagation from the vXi1 domain to the
// scalar domain.
if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
!Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == MVT::v16i1 &&
isNullConstant(N0.getOperand(1)))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
// Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
// and the vbroadcast_load are both integer or both fp. In some cases this
// will remove the bitcast entirely.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
// Don't swap i8/i16 since we don't have fp types of that size.
if (MemSize >= 32) {
MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
: MVT::getIntegerVT(MemSize);
MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
: MVT::getIntegerVT(SrcVTSize);
LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
MemVT, BCast->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
return DAG.getBitcast(VT, ResNode);
}
}
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
if (VT == MVT::x86mmx) {
// Detect MMX constant vectors.
APInt UndefElts;
SmallVector<APInt, 1> EltBits;
if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
SDLoc DL(N0);
// Handle zero-extension of i32 with MOVD.
if (EltBits[0].countLeadingZeros() >= 32)
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
// Else, bitcast to a double.
// TODO - investigate supporting sext 32-bit immediates on x86_64.
APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
}
// Detect bitcasts to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
}
if (AllUndefOrZero) {
SDValue N00 = N0.getOperand(0);
SDLoc dl(N00);
N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
}
}
// Detect bitcasts of 64-bit build vectors and convert to a
// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
// lowest element.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0.getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
}
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
}
}
// Try to remove a bitcast of a constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
}
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
if (C->isAllOnesValue())
return DAG.getConstant(1, SDLoc(N0), VT);
if (C->isNullValue())
return DAG.getConstant(0, SDLoc(N0), VT);
}
// Look for a MOVMSK that is possibly truncated and then bitcast to vXi1.
// Turn it into a sign bit compare that produces a k-register. This avoids
// a trip through a GPR.
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isPowerOf2_32(VT.getVectorNumElements())) {
unsigned NumElts = VT.getVectorNumElements();
SDValue Src = N0;
// Peek through truncate.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
Src = N0.getOperand(0);
if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
SDValue MovmskIn = Src.getOperand(0);
MVT MovmskVT = MovmskIn.getSimpleValueType();
unsigned MovMskElts = MovmskVT.getVectorNumElements();
// We allow extra bits of the movmsk to be used since they are known zero.
// We can't convert a VPMOVMSKB without avx512bw.
if (MovMskElts <= NumElts &&
(Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
SDLoc dl(N);
MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
DAG.getConstant(0, dl, IntVT), ISD::SETLT);
if (EVT(CmpVT) == VT)
return Cmp;
// Pad with zeroes up to original VT to replace the zeroes that were
// being used from the MOVMSK.
unsigned NumConcats = NumElts / MovMskElts;
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
Ops[0] = Cmp;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
}
}
}
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
return V;
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
// Given an ABS node, detect the following pattern:
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
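// For example (illustrative), it matches
//   (abs (sub (zero_extend (v16i8 a)), (zero_extend (v16i8 b))))
// and returns the two zero_extend nodes in Op0/Op1.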
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
SDValue AbsOp1 = Abs->getOperand(0);
if (AbsOp1.getOpcode() != ISD::SUB)
return false;
Op0 = AbsOp1.getOperand(0);
Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
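// For example (an illustrative sketch): for two v16i8->v16i32 zexts the i8
// sources are fed directly to a v16i8 PSADBW, giving a v2i64 result where
// each 64-bit lane holds the sum of absolute differences of the matching
// 8-byte group.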
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const SDValue &Zext1, const SDLoc &DL,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
unsigned NumConcat = RegSize / InVT.getSizeInBits();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
Ops[0] = Zext0.getOperand(0);
MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
};
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
PSADBWBuilder);
}
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
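// For example (illustrative): a UMIN reduction of v8i16 maps directly onto
// PHMINPOSUW, while SMIN/SMAX/UMAX first XOR the vector with a constant mask
// (and XOR the result back afterwards) so the same unsigned-min instruction
// can be reused.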
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
return SDValue();
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
SDValue MinPos = Src;
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
SrcVT = Lo.getValueType();
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// For v16i8 cases we need to perform UMIN on pairs of byte elements,
// shuffling each upper element down and inserting zeros. This means that the
// v16i8 UMIN will leave the upper element as zero, performing zero-extension
// ready for the PHMINPOS.
if (ExtractVT == MVT::i8) {
SDValue Upper = DAG.getVectorShuffle(
SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
}
// Perform the PHMINPOS on a v8i16 vector.
MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
DAG.getIntPtrConstant(0, DL));
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
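// For example (an illustrative sketch), with M = (movmsk (v4i32 setcc ...)):
//   any_of -> setcc (M != 0)
//   all_of -> setcc (M == 0xF)
//   parity -> and (ctpop M), 1
// matching the CmpC/CondCode selection at the end of this function.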
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
if (!Match && ExtractVT == MVT::i1)
Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
// which we can't support here for now.
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
SDValue Movmsk;
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
if (NumElts > 64 || !isPowerOf2_32(NumElts))
return SDValue();
if (TLI.isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
// PCMPEQQ (SSE41+), use PCMPEQD instead.
if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
Match.getOpcode() == ISD::SETCC &&
ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
ISD::CondCode::SETEQ) {
SDValue Vec = Match.getOperand(0);
if (Vec.getValueType().getScalarType() == MVT::i64 &&
(2 * NumElts) <= MaxElts) {
NumElts *= 2;
EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Match = DAG.getSetCC(
DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
}
}
// Use combineBitcastvxi1 to create the MOVMSK.
while (NumElts > MaxElts) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
NumElts /= 2;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
if (!Movmsk)
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
// FIXME: Better handling of k-registers or 512-bit vectors?
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
return SDValue();
// Make sure this isn't a vector of 1 element. The perf win from using
// MOVMSK diminishes with fewer elements in the reduction, but it is
// generally better to get the comparison over to the GPRs as soon as
// possible to reduce the number of vector ops.
if (Match.getValueType().getVectorNumElements() < 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
if (DAG.ComputeNumSignBits(Match) != BitWidth)
return SDValue();
if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
MatchSizeInBits = Match.getValueSizeInBits();
}
// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
MVT MaskSrcVT;
if (64 == BitWidth || 32 == BitWidth)
MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
MatchSizeInBits / BitWidth);
else
MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
NumElts = MaskSrcVT.getVectorNumElements();
}
assert((NumElts <= 32 || NumElts == 64) &&
"Not expecting more than 64 elements");
MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
SDValue Mask = DAG.getConstant(1, DL, CmpVT);
SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
CmpC = DAG.getConstant(0, DL, CmpVT);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
DL, CmpVT);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
EVT SetccVT =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
// Verify the type we're extracting is either i32 or i64.
// FIXME: Could support other types, but this is what we have coverage for.
if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
return SDValue();
EVT VT = Extract->getOperand(0).getValueType();
if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Match shuffle + add pyramid.
ISD::NodeType BinOp;
SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
// The operand is expected to be zero extended from i8
// (verified in detectZextAbsDiff).
// In order to convert to i64 and above, an additional any/zero/sign
// extend is expected.
// The zero extend from 32 bits has no mathematical effect on the result.
// The sign extend is also effectively a zero extend (it extends the sign
// bit, which is zero), so it is correct to skip the sign/zero extend
// instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
Root.getOpcode() == ISD::ZERO_EXTEND ||
Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be an ABS node that is the root of
// an abs-diff pattern.
if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the ABS.
SDValue Zext0, Zext1;
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
// Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
unsigned Stages = Log2_32(VT.getVectorNumElements());
EVT SadVT = SAD.getValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
for(unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
}
}
unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
// Return the lowest ExtractSizeInBits bits.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
SadVT.getSizeInBits() / ExtractSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc dl(N);
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
unsigned SrcEltBits = SrcSVT.getSizeInBits();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
const APInt &IdxC = N->getConstantOperandAPInt(1);
if (IdxC.uge(NumSrcElts))
return SDValue();
SDValue SrcBC = peekThroughBitcasts(Src);
// Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
EVT SrcOpVT = SrcOp.getValueType();
if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
(SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
// TODO support non-zero offsets.
if (Offset == 0) {
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
return SrcOp;
}
}
}
// If we're extracting a single element from a broadcast load and there are
// no other users, just create a single load.
if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getPointerInfo(),
MemIntr->getOriginalAlign(),
MemIntr->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
}
}
// Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
// TODO: Move to DAGCombine?
if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
SrcBC.getValueType().isInteger() &&
(SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
SrcBC.getScalarValueSizeInBits() ==
SrcBC.getOperand(0).getValueSizeInBits()) {
unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
if (IdxC.ult(Scale)) {
unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
SDValue Scl = SrcBC.getOperand(0);
EVT SclVT = Scl.getValueType();
if (Offset) {
Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
DAG.getShiftAmountConstant(Offset, SclVT, dl));
}
Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
return Scl;
}
}
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
}
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Shuffle inputs must be the same size as the result.
if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
}))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
narrowShuffleMaskElts(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
int ExtractIdx = (int)N->getConstantOperandVal(1);
int Scale = Mask.size() / NumSrcElts;
int Lo = Scale * ExtractIdx;
int Hi = Scale * (ExtractIdx + 1);
for (int i = 0, e = (int)Mask.size(); i != e; ++i)
if (i < Lo || Hi <= i)
Mask[i] = SM_SentinelUndef;
SmallVector<int, 16> WidenedMask;
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
// TODO - investigate support for wider shuffle masks with known upper
// undef/zero elements for implicit zero-extension.
}
}
// Check if narrowing/widening failed.
if (Mask.size() != NumSrcElts)
return SDValue();
int SrcIdx = Mask[IdxC.getZExtValue()];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (SrcIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
// circumstances, depending on SSE-level.
// TODO: Investigate using extract_subvector for larger vectors.
// TODO: Investigate float/double extraction if it will be just stored.
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
return SDValue();
}
/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
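/// For example (illustrative):
///   (f32 extract_vector_elt (v4f32 fadd X, Y), 0)
/// becomes
///   (f32 fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0))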
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Vec.getValueType();
// TODO: If this is a unary/expensive/expand op, allow extraction from a
// non-zero element because the shuffle+scalar op will be cheaper?
if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
return SDValue();
// Vector FP compares don't fit the pattern of FP math ops (propagate, not
// extract, the condition code), so deal with those as a special-case.
if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
if (OpVT != MVT::f32 && OpVT != MVT::f64)
return SDValue();
// extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(1), Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
}
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Vector FP selects don't fit the pattern of FP math ops (because the
// condition has a different type and we have to change the opcode), so deal
// with those here.
// FIXME: This is restricted to pre type legalization by ensuring the setcc
// has i1 elements. If we loosen this we need to convert vector bool to a
// scalar bool.
if (Vec.getOpcode() == ISD::VSELECT &&
Vec.getOperand(0).getOpcode() == ISD::SETCC &&
Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
// ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
Vec.getOperand(0).getValueType().getScalarType(),
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(1), Index);
SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(2), Index);
return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
}
// TODO: This switch could include FNEG and the x86-specific FP logic ops
// (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
// missed load folding and fma+fneg combining.
switch (Vec.getOpcode()) {
case ISD::FMA: // Begin 3 operands
case ISD::FMAD:
case ISD::FADD: // Begin 2 operands
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
case ISD::FCOPYSIGN:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case X86ISD::FMAX:
case X86ISD::FMIN:
case ISD::FABS: // Begin 1 operand
case ISD::FSQRT:
case ISD::FRINT:
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FROUND:
case ISD::FFLOOR:
case X86ISD::FRCP:
case X86ISD::FRSQRT: {
// extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
SDLoc DL(ExtElt);
SmallVector<SDValue, 4> ExtOps;
for (SDValue Op : Vec->ops())
ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
}
default:
return SDValue();
}
llvm_unreachable("All opcodes should return within switch");
}
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
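/// For example (an illustrative sketch assuming SSSE3), a v4i32 add reduction
///   extract_vector_elt (add (shuffle X), X), 0
/// becomes
///   extract_vector_elt (hadd (hadd X, X), (hadd X, X)), 0
/// i.e. Log2(NumElts) HADD steps, each with the running value as both operands.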
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
// We need at least SSE2 to do anything here.
if (!Subtarget.hasSSE2())
return SDValue();
ISD::NodeType Opc;
SDValue Rdx =
DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
if (!Rdx)
return SDValue();
SDValue Index = ExtElt->getOperand(1);
assert(isNullConstant(Index) &&
"Reduction doesn't end in an extract from index 0");
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
SDLoc DL(ExtElt);
// vXi8 reduction - sub 128-bit vector.
if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
if (VecVT == MVT::v4i8) {
// Pad with zero.
if (Subtarget.hasSSE41()) {
Rdx = DAG.getBitcast(MVT::i32, Rdx);
Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
DAG.getConstant(0, DL, MVT::v4i32), Rdx,
DAG.getIntPtrConstant(0, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
} else {
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
DAG.getConstant(0, DL, VecVT));
}
}
if (Rdx.getValueType() == MVT::v8i8) {
// Pad with undef.
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
DAG.getUNDEF(MVT::v8i8));
}
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
DAG.getConstant(0, DL, MVT::v16i8));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
// Must be a >=128-bit vector with pow2 elements.
if ((VecVT.getSizeInBits() % 128) != 0 ||
!isPowerOf2_32(VecVT.getVectorNumElements()))
return SDValue();
// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
VecVT = Lo.getValueType();
Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
}
assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
SDValue Hi = DAG.getVectorShuffle(
MVT::v16i8, DL, Rdx, Rdx,
{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
// Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
if (!shouldUseHorizontalOp(true, DAG, Subtarget))
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
// TODO: We could extend this to handle 512-bit or even longer vectors.
if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
VecVT = Rdx.getValueType();
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
SDValue InputVector = N->getOperand(0);
SDValue EltIdx = N->getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
EVT SrcVT = InputVector.getValueType();
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
if (CIdx && VT.isInteger()) {
APInt UndefVecElts;
SmallVector<APInt, 16> EltBits;
unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
EltBits, true, false)) {
uint64_t Idx = CIdx->getZExtValue();
if (UndefVecElts[Idx])
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
dl, VT);
}
}
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
// PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
if ((InputVector.getOpcode() == X86ISD::PINSRB ||
InputVector.getOpcode() == X86ISD::PINSRW) &&
InputVector.getOperand(2) == EltIdx) {
assert(SrcVT == InputVector.getOperand(0).getValueType() &&
"Vector type mismatch");
SDValue Scl = InputVector.getOperand(1);
Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
return DAG.getZExtOrTrunc(Scl, dl, VT);
}
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
// combineBasicSADPattern.
return SDValue();
}
// Detect mmx extraction of all bits as an i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);
}
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization.
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
return V;
if (SDValue V = scalarizeExtEltFP(N, DAG))
return V;
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
//
// Note that we only combine extracts on the *same* result number, i.e.
// t0 = merge_values a0, a1, a2, a3
// i1 = extract_vector_elt t0, Constant:i64<2>
// i1 = extract_vector_elt t0, Constant:i64<3>
// but not
// i1 = extract_vector_elt t0:1, Constant:i64<2>
// since the latter would need its own MOVMSK.
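// As a concrete sketch of the rewrite below: for a v16i1 source, the MOVMSK
// form yields one bit per lane, so extracting lane 3 becomes
// ((movmsk X) & 0x8) == 0x8, and every i1 extract of the same source can
// reuse that single MOVMSK.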
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
unsigned ResNo = InputVector.getResNo();
auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
}
return false;
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
unsigned MaskIdx = Use->getConstantOperandVal(1);
APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
}
return SDValue(N, 0);
}
}
}
return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
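/// For example, (vselect M, X, zeroinitializer) with a sign-splat mask M
/// becomes (and M, X): lanes where M is all-ones keep X and lanes where M is
/// all-zeros become zero, which matches the select semantics exactly.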
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
assert(CondVT.isVector() && "Vector select expects a vector selector!");
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// If both inputs are 0/undef, create a complete zero vector.
// FIXME: As noted above this should be handled by DAGCombiner/getNode.
if (TValIsAllZeros && FValIsAllZeros) {
if (VT.isFloatingPoint())
return DAG.getConstantFP(0.0, DL, VT);
return DAG.getConstant(0, DL, VT);
}
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
// Don't check if the types themselves are equal because that excludes
// vector floating-point selects.
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
// Try to invert the condition if true value is not all 1s and false value is
// not all 0s. Only do this if the condition has one use.
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
CondVT) {
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
ISD::CondCode NewCC = ISD::getSetCCInverse(
cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
}
}
// Cond value must be 'sign splat' to be converted to a logical op.
if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
return SDValue();
// vselect Cond, 111..., 000... -> Cond
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
if (TValIsAllOnes) {
SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
return DAG.getBitcast(VT, Or);
}
// vselect Cond, X, 000... -> and Cond, X
if (FValIsAllZeros) {
SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
return DAG.getBitcast(VT, And);
}
// vselect Cond, 000..., X -> andn Cond, X
if (TValIsAllZeros) {
MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
return DAG.getBitcast(VT, AndN);
}
return SDValue();
}
/// If both arms of a vector select are concatenated vectors, split the select,
/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
return SDValue();
// TODO: Split 512-bit vectors too?
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
// TODO: Split as long as any 2 of the 3 operands are concatenated?
SDValue Cond = N->getOperand(0);
SDValue TVal = N->getOperand(1);
SDValue FVal = N->getOperand(2);
SmallVector<SDValue, 4> CatOpsT, CatOpsF;
if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
!collectConcatOps(TVal.getNode(), CatOpsT) ||
!collectConcatOps(FVal.getNode(), CatOpsF))
return SDValue();
auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
makeBlend, /*CheckBWI*/ false);
}
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDLoc DL(N);
auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
if (!TrueC || !FalseC)
return SDValue();
// Don't do this for crazy integer types.
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// We're going to use the condition bit in math or logic ops. We could allow
// this with a wider condition value (post-legalization it becomes an i8),
// but if nothing is creating selects that late, it doesn't matter.
if (Cond.getValueType() != MVT::i1)
return SDValue();
// A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
// 3, 5, or 9 with i32/i64, so those get transformed too.
// TODO: For constants that overflow or do not differ by power-of-2 or small
// multiplier, convert to 'and' + 'add'.
const APInt &TrueVal = TrueC->getAPIntValue();
const APInt &FalseVal = FalseC->getAPIntValue();
bool OV;
APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
if (OV)
return SDValue();
APInt AbsDiff = Diff.abs();
if (AbsDiff.isPowerOf2() ||
((VT == MVT::i32 || VT == MVT::i64) &&
(AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
// We need a positive multiplier constant for shift/LEA codegen. The 'not'
// of the condition can usually be folded into a compare predicate, but even
// without that, the sequence should be cheaper than a CMOV alternative.
if (TrueVal.slt(FalseVal)) {
Cond = DAG.getNOT(DL, Cond, MVT::i1);
std::swap(TrueC, FalseC);
}
// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
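// e.g. select Cond, 20, 4: the difference is 16, so this becomes
// (zext(Cond) << 4) + 4, giving 20 when Cond is 1 and 4 when Cond is 0,
// which avoids materializing both constants for a CMOV.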
SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
// Multiply condition by the difference if non-one.
if (!AbsDiff.isOneValue())
R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
// Add the base if non-zero.
if (!FalseC->isNullValue())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
return R;
}
return SDValue();
}
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
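/// The variable blend instructions (e.g. PBLENDVB / BLENDVPS) select each
/// element based only on the most significant bit of the corresponding mask
/// element, so it is enough to demand the sign bit of the condition and let
/// the remaining condition bits be simplified away.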
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
// Don't optimize before the condition has been transformed to a legal type
// and don't ever optimize vector selects that map to AVX512 mask-registers.
unsigned BitWidth = Cond.getScalarValueSizeInBits();
if (BitWidth < 8 || BitWidth > 64)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2.
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
return SDValue();
auto OnlyUsedAsSelectCond = [](SDValue Cond) {
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
if ((UI->getOpcode() != ISD::VSELECT &&
UI->getOpcode() != X86ISD::BLENDV) ||
UI.getOperandNo() != 0)
return false;
return true;
};
APInt DemandedBits(APInt::getSignMask(BitWidth));
if (OnlyUsedAsSelectCond(Cond)) {
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Update all the nodes so that we do not use
// the generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
if (U->getOpcode() == X86ISD::BLENDV)
continue;
SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
DCI.AddToWorklist(U);
}
DCI.CommitTargetLoweringOpt(TLO);
return SDValue(N, 0);
}
// Otherwise we can still at least try to simplify multiple use bits.
if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
N->getOperand(1), N->getOperand(2));
return SDValue();
}
// Try to match:
// (or (and M, (sub 0, X)), (pandn M, X))
// which is a special case of:
// (select M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
// ( M ? -X : X) == ((X ^ M ) + (M & 1))
// This lets us transform our vselect to:
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
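// As a quick sanity check with 8-bit lanes: for M = 0xFF and X = 5,
// (xor 5, 0xFF) = -6 and (sub -6, -1) = -5, the negated value, while for
// M = 0 we get (xor 5, 0) = 5 and (sub 5, 0) = 5, i.e. X unchanged.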
static SDValue combineLogicBlendIntoConditionalNegate(
EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
EVT MaskVT = Mask.getValueType();
assert(MaskVT.isInteger() &&
DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
"Mask must be zero/all-bits");
if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
return SDValue();
if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
};
SDValue V;
if (IsNegV(Y.getNode(), X))
V = X;
else if (IsNegV(X.getNode(), Y))
V = Y;
else
return SDValue();
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
// If the negate was on the false side of the select, then
// the operands of the SUB need to be swapped. PR 27251.
// This is because the pattern being matched above is
// (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
// but if the pattern matched was
// (vselect M, X, (sub (0, X))), that is really negation of the pattern
// above, -(vselect M, (sub 0, X), X), and therefore the replacement
// pattern also needs to be a negation of the replacement pattern above.
// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
// sub accomplishes the negation of the replacement pattern.
if (V == Y)
std::swap(SubOp1, SubOp2);
SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
return DAG.getBitcast(VT, Res);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
// Try simplification again because we use this function to optimize
// BLENDV nodes that are not handled by the generic combiner.
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
// Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
// Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
// can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
if (CondVT.isVector() && CondVT.isInteger() &&
CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
(!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
DL, DAG, Subtarget))
return V;
// Convert vselects with constant condition into shuffles.
if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETULT:
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
}
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// Some mask scalar intrinsics rely on checking if only one bit is set
// and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isOneConstant(AndNode.getOperand(1))) {
// LHS and RHS are swapped because the setcc outputs 1 when the AND
// resulted in 0, and vice versa.
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
}
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
// The same applies to all vectors of i8 and i16 elements without BWI.
// Make sure we extend these even before type legalization gets a chance to
// split wide vectors.
// Since SKX, these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
// AVX512 - Extend select with zero to merge with target shuffle.
// select(mask, extract_subvector(shuffle(x)), zero) -->
// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
// TODO - support non target shuffles as well.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
auto SelectableOp = [&TLI](SDValue Op) {
return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isTargetShuffle(Op.getOperand(0).getOpcode()) &&
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse();
};
bool SelectableLHS = SelectableOp(LHS);
bool SelectableRHS = SelectableOp(RHS);
bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
VT.getSizeInBits());
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
VT.getSizeInBits());
Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
DAG.getUNDEF(SrcCondVT), Cond,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
}
}
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, $edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other->getNumOperands() == 2 &&
Other->getOperand(0) == Cond.getOperand(0)) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
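// e.g. with u8 lanes, x = 10, y = 25: the select produces 0 because
// x >= y is false, and psubus clamps 10 - 25 to 0 as well; when x >= y
// both forms produce the plain x - y.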
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
if (isa<BuildVectorSDNode>(CondRHS)) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return (!Op && !Cond) ||
(Op && Cond &&
Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
/*AllowUndefs*/ true)) {
OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask()) {
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
}
}
}
}
}
// Match VSELECTs into add with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// paddus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CondLHS = Cond->getOperand(0);
SDValue CondRHS = Cond->getOperand(1);
// Check if one of the arms of the VSELECT is a vector with all bits set.
// If it's on the left side, invert the predicate to simplify the logic below.
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
// Canonicalize condition operands.
if (CC == ISD::SETUGE) {
std::swap(CondLHS, CondRHS);
CC = ISD::SETULE;
}
// We can test against either of the addition operands.
// x <= x+y ? x+y : ~0 --> addus x, y
// x+y >= x ? x+y : ~0 --> addus x, y
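// e.g. with u8 lanes, x = 200, y = 100: x + y wraps to 44, so x <= x+y is
// false and the select yields ~0 (255); paddus saturates 300 to 255 too.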
if (CC == ISD::SETULE && Other == CondRHS &&
(OpLHS == CondLHS || OpRHS == CondLHS))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
CondLHS == OpLHS) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > ~C ? x+C : ~0 --> addus x, C
auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return Cond->getAPIntValue() == ~Op->getAPIntValue();
};
if (CC == ISD::SETULE &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
}
}
}
// Check if the first operand is all zeros and Cond type is vXi1.
// If this is an AVX512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
if (CondVT.getScalarType() != MVT::i1)
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
// Try to optimize vXi1 selects if both operands are either all constants or
// bitcasts from scalar integer type. In that case we can convert the operands
// to integer and use an integer select which will be converted to a CMOV.
// We need to take a little bit of care to avoid creating an i64 type after
// type legalization.
if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
if ((LHSIsConst ||
(LHS.getOpcode() == ISD::BITCAST &&
LHS.getOperand(0).getValueType() == IntVT)) &&
(RHSIsConst ||
(RHS.getOpcode() == ISD::BITCAST &&
RHS.getOperand(0).getValueType() == IntVT))) {
if (LHSIsConst)
LHS = combinevXi1ConstantToInteger(LHS, DAG);
else
LHS = LHS.getOperand(0);
if (RHSIsConst)
RHS = combinevXi1ConstantToInteger(RHS, DAG);
else
RHS = RHS.getOperand(0);
SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
}
}
// If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
// single bits, then invert the predicate and swap the select operands.
// This can lower using a vector shift bit-hack rather than mask and compare.
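// e.g. with 32-bit lanes and a mask constant of 4 in some lane, shifting
// left by 32 - 1 - log2(4) = 29 moves bit 2 into the sign bit, so the test
// "(X & 4) != 0" becomes "(X << 29) < 0" and the blend can key directly
// off the sign bit.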
if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
Cond.getOperand(0).getOpcode() == ISD::AND &&
isNullOrNullSplat(Cond.getOperand(1)) &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
Cond.getOperand(0).getValueType() == VT) {
// The 'and' mask must be composed of power-of-2 constants.
SDValue And = Cond.getOperand(0);
auto *C = isConstOrConstSplat(And.getOperand(1));
if (C && C->getAPIntValue().isPowerOf2()) {
// vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
SDValue NotCond =
DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
}
// If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
// and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
// 16-bit lacks a proper blendv.
unsigned EltBitWidth = VT.getScalarSizeInBits();
bool CanShiftBlend =
TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
(Subtarget.hasAVX2() && EltBitWidth == 64) ||
(Subtarget.hasXOP()));
if (CanShiftBlend &&
ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
return C->getAPIntValue().isPowerOf2();
})) {
// Create a left-shift constant to get the mask bits over to the sign-bit.
SDValue Mask = And.getOperand(1);
SmallVector<int, 32> ShlVals;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
ShlVals.push_back(EltBitWidth - 1 -
MaskVal->getAPIntValue().exactLogBase2());
}
// vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
SDValue NewCond =
DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
}
}
return SDValue();
}
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Can't replace the cmp if it has more uses than the one we're looking at.
// FIXME: We would like to be able to handle this, but would need to make sure
// all uses were updated.
if (!Cmp.hasOneUse())
return SDValue();
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
// Using the proper condcodes (see below), overflow is checked for.
// FIXME: We can generalize both constraints:
// - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
// if the result is compared.
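// e.g. "t = atomic_fetch_add(&x, 1); if (t < 0)" becomes "lock add x, 1; jle":
// t < 0 iff t + 1 <= 0, and the signed LE condition (ZF or SF != OF) still
// gives the right answer when t == INT_MAX and the increment wraps.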
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
return SDValue();
unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
return SDValue();
SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
return SDValue();
APInt Addend = OpRHSC->getAPIntValue();
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
return SDValue();
APInt Comparison = CmpRHSC->getAPIntValue();
// If the addend is the negation of the comparison value, then we can do
// a full comparison by emitting the atomic arithmetic as a locked sub.
if (Comparison == -Addend) {
// The CC is fine, but we need to rewrite the LHS of the comparison as an
// atomic sub.
auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
auto AtomicSub = DAG.getAtomic(
ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
AN->getMemOperand());
auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// We can handle comparisons with zero in a number of cases by manipulating
// the CC used.
if (!Comparison.isNullValue())
return SDValue();
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
else
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Check CMP operands. One of them should be 0 or 1 and the other should be
// a SetCC or extended from it.
SDValue Op1 = Cmp.getOperand(0);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
SetCC = Op1;
else // Quit if neither operand is a constant.
return SDValue();
if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 nor 1.
return SDValue();
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
} else
SetCC = SetCC.getOperand(0);
}
switch (SetCC.getOpcode()) {
case X86ISD::SETCC_CARRY:
// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
// i.e. it's a comparison against true but the result of SETCC_CARRY is not
// truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(1);
case X86ISD::CMOV: {
// Check whether the false/true values are canonical, i.e. 0 or 1.
ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
if (Op.getOpcode() == ISD::ZERO_EXTEND ||
Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if false cond is
// found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
bool FValIsFalse = true;
if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
}
// Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
return SDValue();
CC = X86::CondCode(SetCC.getConstantOperandVal(2));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(3);
}
}
return SDValue();
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
/// (X86or (X86setcc) (X86setcc))
/// (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
}
isAnd = false;
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
isAnd = true;
LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
SetCC1 = Cond->getOperand(1);
break;
};
// Make sure we have SETCC nodes, using the same flags value.
if (SetCC0.getOpcode() != X86ISD::SETCC ||
SetCC1.getOpcode() != X86ISD::SETCC ||
SetCC0->getOperand(1) != SetCC1->getOperand(1))
return false;
CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
}
// When legalizing carry, we create carries via add X, -1.
// If that comes from an actual carry, via setcc, we use the
// carry directly.
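// e.g. for a carry bit c in {0, 1}, (add c, -1) sets CF to exactly c
// (c == 1 carries out of the addition, c == 0 does not), so a COND_B user
// of that add can consume the flags that produced c in the first place.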
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
Carry.getOpcode() == ISD::SIGN_EXTEND ||
Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
isOneConstant(Carry.getOperand(1))))
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
uint64_t CarryCC = Carry.getConstantOperandVal(0);
SDValue CarryOp1 = Carry.getOperand(1);
if (CarryCC == X86::COND_B)
return CarryOp1;
if (CarryCC == X86::COND_A) {
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp
// instruction cannot take an immediate as its first operand.
//
if (CarryOp1.getOpcode() == X86ISD::SUB &&
CarryOp1.getNode()->hasOneUse() &&
CarryOp1.getValueType().isInteger() &&
!isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
SDValue SubCommute =
DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
CarryOp1.getOperand(1), CarryOp1.getOperand(0));
return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
}
}
// If this is a check of the z flag of an add with 1, switch to the
// C flag.
if (CarryCC == X86::COND_E &&
CarryOp1.getOpcode() == X86ISD::ADD &&
isOneConstant(CarryOp1.getOperand(1)))
return CarryOp1;
}
}
}
return SDValue();
}
/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
/// to avoid the inversion.
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
if (EFLAGS.getOpcode() != X86ISD::PTEST &&
EFLAGS.getOpcode() != X86ISD::TESTP)
return SDValue();
// PTEST/TESTP sets EFLAGS as:
// TESTZ: ZF = (Op0 & Op1) == 0
// TESTC: CF = (~Op0 & Op1) == 0
// TESTNZC: ZF == 0 && CF == 0
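// e.g. when Op0 is (not X), a COND_AE user (!TESTC, i.e. (~Op0 & Op1) != 0)
// is really asking whether (X & Op1) != 0, so we can emit PTEST/TESTP on
// X and Op1 instead and flip the condition to COND_NE (!TESTZ).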
EVT VT = EFLAGS.getValueType();
SDValue Op0 = EFLAGS.getOperand(0);
SDValue Op1 = EFLAGS.getOperand(1);
EVT OpVT = Op0.getValueType();
// TEST*(~X,Y) can be rewritten as TEST*(X,Y) with an adjusted condition code.
if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
X86::CondCode InvCC;
switch (CC) {
case X86::COND_B:
// testc -> testz.
InvCC = X86::COND_E;
break;
case X86::COND_AE:
// !testc -> !testz.
InvCC = X86::COND_NE;
break;
case X86::COND_E:
// testz -> testc.
InvCC = X86::COND_B;
break;
case X86::COND_NE:
// !testz -> !testc.
InvCC = X86::COND_AE;
break;
case X86::COND_A:
case X86::COND_BE:
// testnzc -> testnzc (no change).
InvCC = CC;
break;
default:
InvCC = X86::COND_INVALID;
break;
}
if (InvCC != X86::COND_INVALID) {
CC = InvCC;
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, NotOp0), Op1);
}
}
if (CC == X86::COND_E || CC == X86::COND_NE) {
// TESTZ(X,~Y) == TESTC(Y,X)
if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, NotOp1), Op0);
}
if (Op0 == Op1) {
SDValue BC = peekThroughBitcasts(Op0);
EVT BCVT = BC.getValueType();
assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
"Unexpected vector type");
// TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));
}
// TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));
}
// If every element is an all-sign value, see if we can use MOVMSK to
// more efficiently extract the sign bits and compare that.
// TODO: Handle TESTC with comparison inversion.
// TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
// MOVMSK combines to make sure it's never worse than PTEST?
unsigned EltBits = BCVT.getScalarSizeInBits();
if (DAG.ComputeNumSignBits(BC) == EltBits) {
assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
APInt SignMask = APInt::getSignMask(EltBits);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Res =
TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
// For vXi16 cases we need to use pmovmskb and extract every other
// sign bit.
SDLoc DL(EFLAGS);
if (EltBits == 16) {
MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
Res = DAG.getBitcast(MovmskVT, Res);
Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
} else {
Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
}
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
DAG.getConstant(0, DL, MVT::i32));
}
}
}
// TESTZ(-1,X) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op0.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
// TESTZ(X,-1) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op1.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
}
return SDValue();
}
// Attempt to simplify the MOVMSK input based on the comparison type.
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Handle eq/ne against zero (any_of).
// Handle eq/ne against -1 (all_of).
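// e.g. for a v4i32 MOVMSK: an eq/ne test against 0 answers "is any lane's
// sign bit set" (any_of), while an eq/ne test against 0xF (all four lane
// bits) answers "is every lane's sign bit set" (all_of).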
if (!(CC == X86::COND_E || CC == X86::COND_NE))
return SDValue();
if (EFLAGS.getValueType() != MVT::i32)
return SDValue();
unsigned CmpOpcode = EFLAGS.getOpcode();
if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
return SDValue();
auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
if (!CmpConstant)
return SDValue();
const APInt &CmpVal = CmpConstant->getAPIntValue();
SDValue CmpOp = EFLAGS.getOperand(0);
unsigned CmpBits = CmpOp.getValueSizeInBits();
assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
// Peek through any truncate.
if (CmpOp.getOpcode() == ISD::TRUNCATE)
CmpOp = CmpOp.getOperand(0);
// Bail if we don't find a MOVMSK.
if (CmpOp.getOpcode() != X86ISD::MOVMSK)
return SDValue();
SDValue Vec = CmpOp.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
"Unexpected MOVMSK operand");
unsigned NumElts = VecVT.getVectorNumElements();
unsigned NumEltBits = VecVT.getScalarSizeInBits();
bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
CmpVal.isMask(NumElts);
if (!IsAnyOf && !IsAllOf)
return SDValue();
// See if we can peek through to a vector with a wider element type, if the
// signbits extend down to all the sub-elements as well.
// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
// potential SimplifyDemandedBits/Elts cases.
if (Vec.getOpcode() == ISD::BITCAST) {
SDValue BC = peekThroughBitcasts(Vec);
MVT BCVT = BC.getSimpleValueType();
unsigned BCNumElts = BCVT.getVectorNumElements();
unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
BCNumEltBits > NumEltBits &&
DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
SDLoc DL(EFLAGS);
unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
DAG.getConstant(CmpMask, DL, MVT::i32));
}
}
// MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
// MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
if (IsAllOf && Subtarget.hasSSE41()) {
SDValue BC = peekThroughBitcasts(Vec);
if (BC.getOpcode() == X86ISD::PCMPEQ &&
ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
}
}
// See if we can avoid a PACKSS by calling MOVMSK on the sources.
// For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
// sign bits prior to the comparison with zero unless we know that
// the vXi16 splats the sign bit down to the lower i8 half.
// TODO: Handle all_of patterns.
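// e.g. PMOVMSKB of the v16i8 bitcast of a v8i16 value produces two bits per
// i16 lane, and the odd bit of each pair is the i16 sign bit, so ANDing with
// 0xAAAA keeps exactly the bits the packed form would have tested.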
if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
SDValue VecOp0 = Vec.getOperand(0);
SDValue VecOp1 = Vec.getOperand(1);
bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
// PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
SDLoc DL(EFLAGS);
SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
if (!SignExt0) {
Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
DAG.getConstant(0xAAAA, DL, MVT::i16));
}
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
DAG.getConstant(0, DL, MVT::i16));
}
// PMOVMSKB(PACKSSBW(LO(X), HI(X)))
// -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
if (CmpBits == 16 && Subtarget.hasInt256() &&
VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
VecOp0.getConstantOperandAPInt(1) == 0 &&
VecOp1.getConstantOperandAPInt(1) == 8 &&
(IsAnyOf || (SignExt0 && SignExt1))) {
SDLoc DL(EFLAGS);
SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
if (!SignExt0 || !SignExt1) {
assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
}
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
DAG.getConstant(CmpMask, DL, MVT::i32));
}
}
// MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
SmallVector<int, 32> ShuffleMask;
SmallVector<SDValue, 2> ShuffleInputs;
if (NumElts == CmpBits &&
getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
ShuffleMask, DAG) &&
ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
unsigned NumShuffleElts = ShuffleMask.size();
APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
for (int M : ShuffleMask) {
assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
DemandedElts.setBit(M);
}
if (DemandedElts.isAllOnesValue()) {
SDLoc DL(EFLAGS);
SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
Result =
DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
EFLAGS.getOperand(1));
}
}
return SDValue();
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
return R;
if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
// cmov X, X, ?, ? --> X
if (TrueOp == FalseOp)
return TrueOp;
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (!(FalseOp.getValueType() == MVT::f80 ||
(FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
(FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
!Subtarget.hasCMov() || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
return Cond;
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
"Implicit constant truncation");
bool isFastMultiplier = false;
if (Diff.ult(10)) {
switch (Diff.getZExtValue()) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
}
}
}
// Handle these cases:
// (select (x != c), e, c) -> (select (x != c), e, x),
// (select (x == c), c, e) -> (select (x == c), x, e)
// where c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
//
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, however, conditional-move from a register needs
// only one instruction.
//
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
//
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
}
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = {FalseOp, Cond.getOperand(0),
DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
}
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
//
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
//
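// Illustrative example: with CC0 == COND_L and CC1 == COND_O on the same
// EFLAGS value, (CMOV F, T, (or setcc_l, setcc_o)) becomes
// (CMOV (CMOV F, T, l), T, o): two cmovs and no materialized i8 booleans
// (a sketch of the generic fold above, not an exhaustive case).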
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}
SDValue LOps[] = {FalseOp, TrueOp,
DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
}
// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
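// Illustrative example: ffs-style code 'X == 0 ? 0 : cttz(X) + 1' is
// (CMOV 0, (ADD (CTTZ X), 1), X != 0); the fold rewrites it as
// (ADD (CMOV -1, (CTTZ X), X != 0), 1), so the cmov selects between two
// registers instead of needing an extra constant materialization.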
if ((CC == X86::COND_NE || CC == X86::COND_E) &&
Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
SDValue Add = TrueOp;
SDValue Const = FalseOp;
// Canonicalize the condition code for easier matching and output.
if (CC == X86::COND_E)
std::swap(Add, Const);
// We might have replaced the constant in the cmov with the LHS of the
// compare. If so change it to the RHS of the compare.
if (Const == Cond.getOperand(0))
Const = Cond.getOperand(1);
// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
SDValue CMov =
DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
return SDValue();
}
/// Different mul shrinking modes.
enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
unsigned SignBits[2] = {1, 1};
bool IsPositive[2] = {false, false};
for (unsigned i = 0; i < 2; i++) {
SDValue Opd = N->getOperand(i);
SignBits[i] = DAG.ComputeNumSignBits(Opd);
IsPositive[i] = DAG.SignBitIsZero(Opd);
}
bool AllPositive = IsPositive[0] && IsPositive[1];
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = ShrinkMode::MULU16;
else
return false;
return true;
}
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of the mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
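///
/// Illustrative example (MULS16 mode): for
/// %5 = mul <8 x i32> (sext <8 x i16> %a), (sext <8 x i16> %b)
/// the 32-bit products are rebuilt from pmullw (low 16 bits) and pmulhw
/// (high 16 bits) of the original i16 values, re-interleaved with
/// punpcklwd/punpckhwd.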
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality:
// the pmullw/pmulhw XMM instructions require SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
if ((NumElts % 2) != 0)
return SDValue();
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
: ISD::SIGN_EXTEND,
DL, VT, MulLo);
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
SmallVector<int, 16> ShuffleMask(NumElts);
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + NumElts;
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i + NumElts / 2;
ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
EVT VT, const SDLoc &DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mult, DL, VT));
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mul1, DL, VT));
Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
switch (MulAmt) {
default:
break;
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 41:
// mul x, 41 => add ((shl (mul x, 5), 3), x)
return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => add ((shl (mul x, 9), 1), x)
return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
case 37:
// mul x, 37 => add ((shl (mul x, 9), 2), x)
return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
case 73:
// mul x, 73 => add ((shl (mul x, 9), 3), x)
return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 26:
// mul x, 26 => add ((mul (mul x, 5), 5), x)
return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
}
// Another trick. If this is a power of 2 plus 2, 4, or 8, we can use a shift
// followed by a single LEA.
// First check whether this is a sum of two powers of 2 because that's easy.
// Then count the trailing zeros to find the smaller power of 2.
// TODO: We can do this even without LEA at a cost of two shifts and an add.
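// Illustrative example: MulAmt == 36 (32 + 4) becomes (X << 5) + (X << 2),
// where the second shift and the add are expected to fold into one scaled LEA.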
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = countTrailingZeros(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ShiftAmt, DL, MVT::i8));
SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ScaleShift, DL, MVT::i8));
return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}
}
return SDValue();
}
// If the upper 17 bits of each element are zero then we can use PMADDWD,
// which is always at least as quick as PMULLD, except on KNL.
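// Illustrative example: a v8i32 multiply of two operands zero-extended from
// v8i8 has the upper 17 bits of every element known zero, so the operands can
// be bitcast to v16i16 and the multiply lowered as a single vpmaddwd (the odd
// i16 lanes are zero, so each 32-bit lane is just lo*lo).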
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.isPMADDWDSlow())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi32 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
// Make sure the type is legal or will be widened to a legal type.
if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
// Without BWI, we would need to split v32i16.
if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// If we are zero extending in two steps without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
(N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))
return SDValue();
APInt Mask17 = APInt::getHighBitsSet(32, 17);
if (!DAG.MaskedValueIsZero(N1, Mask17) ||
!DAG.MaskedValueIsZero(N0, Mask17))
return SDValue();
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
PMADDWDBuilder);
}
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi64 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
VT.getVectorNumElements() < 2 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// PMULDQ returns the 64-bit result of the signed multiplication of the lower
// 32 bits. We can lower with this if the sign bits stretch that far.
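// Illustrative example: a v2i64 multiply whose operands are sign-extended
// from i32 satisfies ComputeNumSignBits > 32, so the full 64-bit product is
// exactly the pmuldq of the low 32-bit halves and no general 64-bit multiply
// expansion is needed.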
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
DAG.ComputeNumSignBits(N1) > 32) {
auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULDQBuilder, /*CheckBWI*/false);
}
// If the upper bits are zero we can use a single pmuludq.
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULUDQBuilder, /*CheckBWI*/false);
}
return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
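///
/// Illustrative example: mul x, 45 is decomposed as 9 * 5, so it can be
/// emitted as two LEAs, e.g. lea (x,x,8) followed by lea (r,r,4), instead of
/// an imul (subject to the profitability checks below).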
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
return V;
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
int64_t SignMulAmt = C->getSExtValue();
assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(AbsMulAmt, DL, VT));
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
return NewMul;
}
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((AbsMulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = AbsMulAmt / 9;
} else if ((AbsMulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = AbsMulAmt / 5;
} else if ((AbsMulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = AbsMulAmt / 3;
}
SDValue NewMul;
// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) ||
(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
if (isPowerOf2_64(MulAmt2) &&
!(SignMulAmt >= 0 && N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is a power of 2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add. Only do this for positive multiply amounts since the
// negate would prevent it from being used as an address mode anyway.
std::swap(MulAmt1, MulAmt2);
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
// Negate the result.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
} else if (!Subtarget.slowLEA())
NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
if (!NewMul) {
assert(C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
MVT::i8)));
// To negate, subtract the number from zero
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 1),
DL, MVT::i8));
// To negate, reverse the operands of the subtract.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
else
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
return NewMul;
}
// Try to form a MULHU or MULHS node by looking for
// (srl (mul ext, ext), 16)
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
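// Illustrative example:
// (srl (mul (zext v8i16 %a to v8i32), (zext v8i16 %b to v8i32)), 16)
// becomes (zext (mulhu %a, %b) to v8i32), which selects to pmulhuw plus a
// zero extend.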
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"SRL or SRA node is required here!");
SDLoc DL(N);
// Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
// the multiply.
if (!Subtarget.hasSSE41())
return SDValue();
// The operation feeding into the shift must be a multiply.
SDValue ShiftOperand = N->getOperand(0);
if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
return SDValue();
// Input type should be at least vXi32.
EVT VT = N->getValueType(0);
if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = ShiftOperand.getOperand(0);
SDValue RHS = ShiftOperand.getOperand(1);
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
EVT MulVT = LHS.getValueType();
if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
return SDValue();
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, DL, VT, Mulh);
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example:
// zext(setcc_c) -> i32 0x0000FFFF
// c1 -> i32 0x0000FFFF
// c2 -> i32 0x00000001
// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
}
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
// Hardware support for vector shifts is sparse, which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// SHL.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
return SDValue();
}
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
// into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
// depending on sign of (SarConst - [56,48,32,24,16])
// sexts on X86 are MOVs (movsx). The MOVs have the same code size
// as the above SHIFTs (only a SHIFT by 1 has smaller code size).
// However, the MOVs have two advantages over a SHIFT:
// 1. a MOV can write to a destination register that differs from the source.
// 2. a MOV accepts memory operands.
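// Illustrative example (i32): (sra (shl X, 24), 26) becomes
// (sra (sign_extend_inreg X, i8), 2), i.e. a movsx of the low byte followed
// by a shift by 2.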
if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
unsigned ShiftSize = SVT.getSizeInBits();
// Skip types without a corresponding sext/zext and ShlConst values
// that are not one of [56,48,32,24,16].
if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
else if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
else
return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
// and-not ('andn').
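// Illustrative example: (srl (and X, 0x7F0), 4) becomes
// (and (srl X, 4), 0x7F); the shrunken mask fits in an 8-bit immediate, so
// the AND gets a smaller encoding.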
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!ShiftC || !AndC)
return SDValue();
// If we can shrink the constant mask below 8-bits or 32-bits, then this
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
APInt MaskVal = AndC->getAPIntValue();
// If this can be matched by a zero extend, don't optimize.
if (MaskVal.isMask()) {
unsigned TO = MaskVal.countTrailingOnes();
if (TO >= 8 && isPowerOf2_32(TO))
return SDValue();
}
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
(OldMaskSize > 32 && NewMaskSize <= 32)) {
// srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
SDLoc DL(N);
SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
}
return SDValue();
}
static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumDstElts = VT.getVectorNumElements();
// Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
// to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getConstantOperandAPInt(1) == 0 &&
N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
N0.getOperand(0).getValueType().is256BitVector()) {
// TODO - support target/faux shuffles.
SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
// To keep the PACK LHS/RHS coherency, we must be able to scale the unary
// shuffle to a vXi64 width - we can probably relax this in the future.
SmallVector<int, 4> ShuffleMask;
if (SVN->getOperand(1).isUndef() &&
scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
SDLoc DL(N);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
Lo = DAG.getBitcast(N0.getValueType(), Lo);
Hi = DAG.getBitcast(N1.getValueType(), Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
Res = DAG.getBitcast(MVT::v4i32, Res);
Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
}
}
}
// Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
// TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
if (VT.is256BitVector()) {
if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
SDValue Op00 = SVN0->getOperand(0);
SDValue Op01 = SVN0->getOperand(1);
SDValue Op10 = SVN1->getOperand(0);
SDValue Op11 = SVN1->getOperand(1);
if ((Op00 == Op11) && (Op01 == Op10)) {
std::swap(Op10, Op11);
ShuffleVectorSDNode::commuteMask(ShuffleMask1);
}
if ((Op00 == Op10) && (Op01 == Op11)) {
SmallVector<int, 4> ShuffleMask;
ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
SDLoc DL(N);
SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
Res = DAG.getBitcast(MVT::v4i64, Res);
Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
}
}
}
}
}
return SDValue();
}
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
"Unexpected PACKSS/PACKUS input type");
bool IsSigned = (X86ISD::PACKSS == Opcode);
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
(N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
APInt Undefs(NumDstElts, 0);
SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
if (UndefElts[SrcIdx]) {
Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
continue;
}
APInt &Val = EltBits[SrcIdx];
if (IsSigned) {
// PACKSS: Truncate signed value with signed saturation.
// Source values less than dst minint are saturated to minint.
// Source values greater than dst maxint are saturated to maxint.
if (Val.isSignedIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getSignedMinValue(DstBitsPerElt);
else
Val = APInt::getSignedMaxValue(DstBitsPerElt);
} else {
// PACKUS: Truncate signed value with unsigned saturation.
// Source values less than zero are saturated to zero.
// Source values greater than dst maxuint are saturated to maxuint.
if (Val.isIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getNullValue(DstBitsPerElt);
else
Val = APInt::getAllOnesValue(DstBitsPerElt);
}
Bits[Lane * NumDstEltsPerLane + Elt] = Val;
}
}
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
// Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
if (SDValue V = combineVectorPackWithShuffle(N, DAG))
return V;
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
N0.getOperand(0).getValueType() == MVT::v8i32) {
if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
(!IsSigned &&
DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
if (Subtarget.hasVLX())
return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
// Widen input to v16i32 so we can truncate that.
SDLoc dl(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
}
}
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
}
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Detect constant shift amounts.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
EltBits[0].getZExtValue(), DAG);
}
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
X86ISD::VSRLI == Opcode) &&
"Unexpected shift opcode");
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
assert(N->getOperand(1).getValueType() == MVT::i8 &&
"Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
ShiftVal = NumBitsPerElt - 1;
}
// (shift X, 0) -> X
if (!ShiftVal)
return N0;
// (shift 0, C) -> 0
if (ISD::isBuildVectorAllZeros(N0.getNode()))
// N0 is all zeros or undef. We guarantee that the bits shifted into the
// result are all zeros, not undef.
return DAG.getConstant(0, SDLoc(N), VT);
// (VSRAI -1, C) -> -1
if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
// N0 is all ones or undef. We guarantee that the bits shifted into the
// result are all ones, not undef.
return DAG.getConstant(-1, SDLoc(N), VT);
// (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
if (Opcode == N0.getOpcode()) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
if (NewShiftVal >= NumBitsPerElt) {
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
NewShiftVal = NumBitsPerElt - 1;
}
return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (N->isOnlyUserOf(N0.getNode()) &&
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
// Undef elements need to fold to 0. It's possible that SimplifyDemandedBits
// created an undef input due to no input bits being demanded, but the user
// still expects 0 in the other bits.
for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
APInt &Elt = EltBits[i];
if (UndefElts[i])
Elt = 0;
else if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftVal);
else
Elt.lshrInPlace(ShiftVal);
}
// Reset undef elements since they were zeroed above.
UndefElts = 0;
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
(N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
"Unexpected vector insertion");
if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
}
// Attempt to combine insertion patterns to a shuffle.
if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0.getOperand(1);
SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getIntPtrConstant(0, DL));
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF =
DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
SDValue X, Y;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
auto GetNot = [&VT, &DAG](SDValue V) {
// Basic X = NOT(Y) detection.
if (SDValue Not = IsNOT(V, DAG))
return Not;
// Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
if (V.getOpcode() == X86ISD::VBROADCAST) {
SDValue Src = V.getOperand(0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector())
return SDValue();
if (SDValue Not = IsNOT(Src, DAG))
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
DAG.getBitcast(SrcVT, Not));
}
return SDValue();
};
if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
} else if (SDValue Not = GetNot(N1)) {
X = Not;
Y = N0;
} else
return SDValue();
X = DAG.getBitcast(VT, X);
Y = DAG.getBitcast(VT, Y);
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
// logical operations, like in the example below.
// or (and (truncate x, truncate y)),
// (xor (truncate z, build_vector (constants)))
// Given a target type \p VT, we generate
// or (and x, y), (xor z, zext(build_vector (constants)))
// given x, y and z are of type \p VT. We can do so, if operands are either
// truncates from VT types, the second operand is a vector of constants or can
// be recursively promoted.
static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
unsigned Depth) {
// Limit recursion to avoid excessive compile times.
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue();
if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
N->getOpcode() != ISD::OR)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
return SDValue();
if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
N0 = NN0;
else {
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
// The type of the truncated inputs.
if (N0.getOperand(0).getValueType() != VT)
return SDValue();
N0 = N0.getOperand(0);
}
if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
N1 = NN1;
else {
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
return SDValue();
if (RHSTrunc)
N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
}
return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
// Generate the wide operation.
SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
if (!Op)
return SDValue();
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
unsigned FPOpcode;
switch (Opcode) {
default: llvm_unreachable("Unexpected input node for FP logic conversion");
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
return FPOpcode;
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
EVT N10Type = N10.getValueType();
// Ensure that both types are the same and are legal scalar fp types.
if (N00Type != N10Type ||
!((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
// to reduce XMM->GPR traffic.
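// Illustrative example: (xor (movmsk (v4f32 A)), (movmsk (v4f32 B))) becomes
// (movmsk (fxor A, B)), leaving only a single XMM->GPR transfer.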
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
unsigned Opc = N->getOpcode();
assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
"Unexpected bit opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Both operands must be single use MOVMSK.
if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
return SDValue();
SDValue Vec0 = N0.getOperand(0);
SDValue Vec1 = N1.getOperand(0);
EVT VecVT0 = Vec0.getValueType();
EVT VecVT1 = Vec1.getValueType();
// Both MOVMSK operands must be from vectors of the same size and the same
// element size, but it's OK for them to differ between fp and int types.
if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
return SDValue();
SDLoc DL(N);
unsigned VecOpc =
VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
SDValue Result =
DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
}
/// If this is a zero/all-bits result that is bitwise-ANDed with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
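///
/// Illustrative example: (and (pcmpgtd A, B), splat(1)) has lanes that are all
/// ones or all zero, so it can become (psrld (pcmpgtd A, B), 31) and the
/// splat(1) constant never has to be loaded.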
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
// Don't prevent creation of ANDN.
if (isBitwiseNot(Op0))
return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
if (Ld->isIndexed())
return SDValue();
SDValue Base = Ld->getBasePtr();
if (Base.getOpcode() != ISD::ADD)
return SDValue();
SDValue ShiftedIndex = Base.getOperand(0);
if (ShiftedIndex.getOpcode() != ISD::SHL)
return SDValue();
return ShiftedIndex.getOperand(0);
}
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
switch (VT.getSizeInBits()) {
default: return false;
case 64: return Subtarget.is64Bit() ? true : false;
case 32: return true;
}
}
return false;
}
// This function recognizes cases where the X86 bzhi instruction can replace an
// 'and-load' sequence.
// When loading an integer value from an array of constants defined as follows:
//
// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and then applying a bitwise AND between the loaded value and another input,
// the result is equivalent to performing bzhi (zero high bits) on that other
// input, using the same index as the load.
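// Illustrative example (32-bit): with array[i] == (1u << i) - 1, the IR
// 'x & array[idx]' is rebuilt as (and x, (srl 0xFFFFFFFF, (32 - idx))),
// which later instruction selection can match as a single bzhi.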
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Node->getSimpleValueType(0);
SDLoc dl(Node);
// Check if subtarget has BZHI instruction for the node's type
if (!hasBZHI(Subtarget, VT))
return SDValue();
// Try matching the pattern for both operands.
for (unsigned i = 0; i < 2; i++) {
SDValue N = Node->getOperand(i);
LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
// Bail out if the operand is not a load instruction.
if (!Ld)
return SDValue();
const Value *MemOp = Ld->getMemOperand()->getValue();
if (!MemOp)
return SDValue();
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
Constant *Init = GV->getInitializer();
Type *Ty = Init->getType();
if (!isa<ConstantDataArray>(Init) ||
!Ty->getArrayElementType()->isIntegerTy() ||
Ty->getArrayElementType()->getScalarSizeInBits() !=
VT.getSizeInBits() ||
Ty->getArrayNumElements() >
Ty->getArrayElementType()->getScalarSizeInBits())
continue;
// Check if the array's constant elements are suitable to our case.
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
bool ConstantsMatch = true;
for (uint64_t j = 0; j < ArrayElementCount; j++) {
ConstantInt *Elem =
dyn_cast<ConstantInt>(Init->getAggregateElement(j));
if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
ConstantsMatch = false;
break;
}
}
if (!ConstantsMatch)
continue;
// Do the transformation (For 32-bit type):
// -> (and (load arr[idx]), inp)
// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
// that will be replaced with one bzhi instruction.
SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
// Get the Node which indexes into the array.
SDValue Index = getIndexFromUnindexedLoad(Ld);
if (!Index)
return SDValue();
Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
}
}
}
}
return SDValue();
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// We only support 64-bit and 32-bit. 64-bit requires special handling
// unless the 64-bit popcnt instruction is legal.
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// LHS needs to be a single use CTPOP.
if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
return SDValue();
// RHS needs to be 1.
if (!isOneConstant(N1))
return SDValue();
SDLoc DL(N);
SDValue X = N0.getOperand(0);
// If this is 64-bit, it's always best to xor the two 32-bit pieces together
// even if we have popcnt.
if (VT == MVT::i64) {
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(32, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
// Generate a 32-bit parity idiom. This will bring us back here if we need
// to expand it too.
SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
DAG.getConstant(1, DL, MVT::i32));
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
}
assert(VT == MVT::i32 && "Unexpected VT!");
// Xor the high and low 16-bits together using a 32-bit operation.
SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(16, DL, MVT::i8));
X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
// Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
// This should allow an h-reg to be used to save a shift.
// FIXME: We only get an h-reg in 32-bit mode.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(8, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
// Copy the inverse of the parity flag into a register with setcc.
SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
// Zero extend to original type.
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
// where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely zero the upper bits of the k-register. We can
// replace the undef in the concat with 0s and remove the AND. This mainly
// helps with v2i1/v4i1 setccs being cast to scalar.
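// For example (illustrative):
//   (and (bitcast i8 (v8i1 concat_vectors (v2i1 setcc), undef, undef, undef)), 3)
// can become a bitcast of the same concat with the undefs replaced by zero
// v2i1 vectors, after which the AND with the 2-bit mask is no longer needed.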
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
EVT VT = N->getValueType(0);
// Make sure this is an AND with constant. We will check the value of the
// constant later.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
// This is implied by the ConstantSDNode.
assert(!VT.isVector() && "Expected scalar VT!");
if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
!N->getOperand(0).hasOneUse() ||
!N->getOperand(0).getOperand(0).hasOneUse())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Src = N->getOperand(0).getOperand(0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
!TLI.isTypeLegal(SrcVT))
return SDValue();
if (Src.getOpcode() != ISD::CONCAT_VECTORS)
return SDValue();
// We only care about the first subvector of the concat; we expect the
// other subvectors to be ignored due to the AND if we make the change.
SDValue SubVec = Src.getOperand(0);
EVT SubVecVT = SubVec.getValueType();
// First subvector should be a setcc with a legal result type. The RHS of the
// AND should be a mask with this many bits.
if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
!N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
return SDValue();
EVT SetccVT = SubVec.getOperand(0).getValueType();
if (!TLI.isTypeLegal(SetccVT) ||
!(Subtarget.hasVLX() || SetccVT.is512BitVector()))
return SDValue();
if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
return SDValue();
// We passed all the checks. Rebuild the concat_vectors with zeroes
// and cast it back to VT.
SDLoc dl(N);
SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
DAG.getConstant(0, dl, SubVecVT));
Ops[0] = SubVec;
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
Ops);
return DAG.getBitcast(VT, Concat);
}
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FAND to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
// Use a 32-bit and+zext if upper bits known zero.
if (VT == MVT::i64 && Subtarget.is64Bit() &&
!isa<ConstantSDNode>(N->getOperand(1))) {
APInt HiMask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
SDLoc dl(N);
SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
}
}
// This must be done before legalization has expanded the ctpop.
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
SmallVector<APInt, 2> SrcPartials;
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
assert(SrcPartials[0].getBitWidth() == NumElts &&
"Unexpected partial reduction mask");
SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
}
}
}
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
if ((VT.getScalarSizeInBits() % 8) == 0 &&
N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
SDValue BitMask = N->getOperand(1);
SDValue SrcVec = N->getOperand(0).getOperand(0);
EVT SrcVecVT = SrcVec.getValueType();
// Check that the constant bitmask masks whole bytes.
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
llvm::all_of(EltBits, [](APInt M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
// Create a root shuffle mask from the byte mask and the extracted index.
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i) {
if (UndefElts[i])
continue;
int VecIdx = Scale * Idx + i;
ShuffleMask[VecIdx] =
EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
}
return SDValue();
}
// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
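// For example (roughly), with a repeating byte mask C this selects bytes from
// X where C is all-ones and from Y where C is all-zeros; with AVX512 the whole
// expression can be emitted as a single VPTERNLOG whose immediate 0xCA encodes
// the per-bit select op0 ? op1 : op2.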
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
MVT VT = N->getSimpleValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
SDValue N0 = peekThroughBitcasts(N->getOperand(0));
SDValue N1 = peekThroughBitcasts(N->getOperand(1));
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
// On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
// VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
Subtarget.hasVLX();
if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
!N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
false, false))
return SDValue();
if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
false, false))
return SDValue();
for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
// TODO - add UNDEF elts support.
if (UndefElts0[i] || UndefElts1[i])
return SDValue();
if (EltBits0[i] != ~EltBits1[i])
return SDValue();
}
SDLoc DL(N);
if (UseVPTERNLOG) {
// Emit a VPTERNLOG node directly.
SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
}
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
DAG.getBitcast(VT, N1.getOperand(0)));
return DAG.getNode(ISD::OR, DL, VT, X, Y);
}
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
return false;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
// Attempt to match OR(AND(M,Y),ANDNP(M,X)).
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return false;
Mask = N1.getOperand(0);
X = N1.getOperand(1);
// Check to see if the mask appeared in both the AND and ANDNP.
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
else if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
else
return false;
// TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
// ANDNP combine allows other combines to happen that prevent matching.
return true;
}
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
// (vselect m, x, y)
// As a special case, try to fold:
// (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
// (sub (xor X, M), M)
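// Note: the mask M must consist of all-ones / all-zeros elements (e.g. the
// result of a vector compare); the ComputeNumSignBits check below enforces
// this.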
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
EVT VT = N->getValueType(0);
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
SDValue X, Y, Mask;
if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
Mask = peekThroughBitcasts(Mask);
X = peekThroughBitcasts(X);
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
unsigned EltBits = MaskVT.getScalarSizeInBits();
// TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
// Attempt to combine to conditional negate: (sub (xor X, M), M)
if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
DAG, Subtarget))
return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
return SDValue();
// If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
if (Subtarget.hasVLX())
return SDValue();
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
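// For example (roughly), for a 32-bit x with LZCNT semantics, ctlz(x) is 32
// only when x == 0, so (srl (ctlz x), 5) yields 1 exactly when x == 0 and 0
// otherwise.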
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
// The result of the shift is true or false, and on X86, the 32-bit
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
// Try to transform:
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, eg:
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
};
// Check the zero extend is extending to 32-bit or more. The code generated by
// srl(ctlz) for 16-bit or less variants of the pattern would require extra
// instructions to clear the upper bits.
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
// Check the node matches: setcc(eq, cmp 0)
auto isSetCCCandidate = [](SDValue N) {
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
SDNode *OR = N->getOperand(0).getNode();
SDValue LHS = OR->getOperand(0);
SDValue RHS = OR->getOperand(1);
// Save nodes matching or(or, setcc(eq, cmp 0)).
SmallVector<SDNode *, 2> ORNodes;
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
ORNodes.push_back(OR);
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
}
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
// to
// or(srl(ctlz),srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
EVT VT = OR->getValueType(0);
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
SDValue Ret, NewRHS;
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
if (!Ret)
return SDValue();
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
while (ORNodes.size() > 0) {
OR = ORNodes.pop_back_val();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
}
if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
}
// Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
SmallVector<APInt, 2> SrcPartials;
if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
assert(SrcPartials[0].getBitWidth() == NumElts &&
"Unexpected partial reduction mask");
SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
}
}
}
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
// Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
// iff the upper elements of the non-shifted arg are zero.
// KUNPCK requires 16+ bool vector elements.
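// For example (illustrative), for v32i1: OR(X, KSHIFTL(Y, 16)) with the upper
// 16 elements of X known zero becomes CONCAT_VECTORS(X[0..15], Y[0..15]),
// i.e. a single KUNPCKWD.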
if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfElts = NumElts / 2;
APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
N1.getConstantOperandAPInt(1) == HalfElts &&
DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
SDLoc dl(N);
return DAG.getNode(
ISD::CONCAT_VECTORS, dl, VT,
extractSubVector(N0, 0, DAG, dl, HalfElts),
extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
}
if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
N0.getConstantOperandAPInt(1) == HalfElts &&
DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
SDLoc dl(N);
return DAG.getNode(
ISD::CONCAT_VECTORS, dl, VT,
extractSubVector(N1, 0, DAG, dl, HalfElts),
extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
}
}
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
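/// The original pattern computes "sign bit of X is clear"; SETGT(X, -1) is
/// equivalent because X > -1 (signed) holds exactly when X is non-negative.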
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// This is only worth doing if the output type is i8 or i1.
EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
// Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
// SetCC on x86 zero extends so only act on this if it's a logical shift.
SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
// Make sure we are truncating from one of i16, i32 or i64.
EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
// N.B. Using SETGE against 0 works but we want a canonical-looking
// comparison; using SETGT matches up with what TranslateX86CC does.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), ResultType);
SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
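/// The equivalence (illustrative): sra(X, elt_size(X)-1) smears the sign bit,
/// giving all-ones for negative elements and zero otherwise; xor-ing with -1
/// then yields all-ones exactly for the non-negative elements, which is what
/// pcmpgt X, -1 computes.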
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
}
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt =
isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
if (!ShiftAmt ||
ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value x to be truncated or SDValue() if the pattern was
/// not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
/// where C1 >= 0 and C2 is unsigned max of destination type.
///
/// (truncate (smax (smin (x, C2), C1)) to dest_type)
/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
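/// For example (illustrative), when truncating vXi32 to vXi16:
///   (truncate (umin x, 65535))           returns x, and
///   (truncate (smin (smax x, 0), 65535)) returns (smax x, 0).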
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
EVT InVT = In.getValueType();
// Saturation with truncation. We truncate from InVT to VT.
assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
// Match min/max and return limit value as a parameter.
auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
return V.getOperand(0);
return SDValue();
};
APInt C1, C2;
if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
// C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
if (C2.isMask(VT.getScalarSizeInBits()))
return UMin;
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
if (MatchMinMax(SMin, ISD::SMAX, C1))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
return SMin;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
C2.uge(C1)) {
return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
}
return SDValue();
}
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
/// signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
/// signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
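/// For example (illustrative), when truncating vXi32 to vXi8 the matched clamp
/// range is [-128, 127], or [0, 255] when MatchPackUS is set.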
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
unsigned NumDstBits = VT.getScalarSizeInBits();
unsigned NumSrcBits = In.getScalarValueSizeInBits();
assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
auto MatchMinMax = [](SDValue V, unsigned Opcode,
const APInt &Limit) -> SDValue {
APInt C;
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
return V.getOperand(0);
return SDValue();
};
APInt SignedMax, SignedMin;
if (MatchPackUS) {
SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
SignedMin = APInt(NumSrcBits, 0);
} else {
SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
}
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
return SMax;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
return SMin;
return SDValue();
}
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2() || !VT.isVector())
return SDValue();
EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
EVT InSVT = InVT.getVectorElementType();
// If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
// split across two registers, we can use a packusdw+perm to clamp to 0-65535
// and concatenate at the same time. Then we can use a final vpmovuswb to
// clip to 0-255.
if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
InVT == MVT::v16i32 && VT == MVT::v16i8) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
DL, DAG, Subtarget);
assert(Mid && "Failed to pack!");
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
}
}
// vXi32 truncate instructions are available with AVX512F.
// vXi16 truncate instructions are only available with AVX512BW.
// For 256-bit or smaller vectors, we require VLX.
// FIXME: We could widen truncates to 512 to remove the VLX restriction.
// If the result type is 256 bits or larger and we have disabled 512-bit
// registers, we should go ahead and use the pack instructions if possible.
bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
(Subtarget.hasBWI() && InSVT == MVT::i16)) &&
(InVT.getSizeInBits() > 128) &&
(Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
!(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
// Only do this when the result is at least 64 bits or we'll be leaving
// dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
assert(Mid && "Failed to pack!");
SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
Subtarget);
assert(V && "Failed to pack!");
return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
}
if (auto SSatVal = detectSSatPattern(In, VT))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
TruncOpc = X86ISD::VTRUNCS;
} else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
SatVal = USatVal;
TruncOpc = X86ISD::VTRUNCUS;
}
if (SatVal) {
unsigned ResElts = VT.getVectorNumElements();
// If the input type is less than 512 bits and we don't have VLX, we need
// to widen to 512 bits.
if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
unsigned NumConcats = 512 / InVT.getSizeInBits();
ResElts *= NumConcats;
SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
ConcatOps[0] = SatVal;
InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
NumConcats * InVT.getVectorNumElements());
SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
}
// Widen the result if it's narrower than 128 bits.
if (ResElts * SVT.getSizeInBits() < 128)
ResElts = 128 / SVT.getSizeInBits();
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
}
return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
/// X86ISD::AVG instruction.
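/// For example (illustrative), for <16 x i8> inputs this becomes a single
/// PAVGB, which computes (zext(a) + zext(b) + 1) >> 1 per element and
/// truncates the result back to i8.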
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
NumElems >= 2 && isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in the AVG pattern and it should be
// wider than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Detect the following pattern:
//
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking that the given SDValue is a constant vector and that each
// element is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
});
};
// Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Operands[0].getOperand(0), Operands[1] },
AVGBuilder);
}
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
// Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
if (ISD::ADD == V.getOpcode()) {
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
}
if (ISD::ZERO_EXTEND != V.getOpcode())
return false;
V = V.getOperand(0);
if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
return false;
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
};
SDValue Op0, Op1;
if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
Operands[2] = Op0;
Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
if (Operands[j].getValueType() != VT) {
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
Operands[j] = Operands[j].getOperand(0);
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
AVGBuilder);
}
return SDValue();
}
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations. Also split non-temporal aligned loads on
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
Ld->getAlignment() >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
Ld->getPointerInfo().getWithOffset(HalfOffset),
Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
// Bool vector load - attempt to cast to an integer, as we have good
// (vXiY *ext(vXi1 bitcast(iX))) handling.
if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
unsigned NumElts = RegVT.getVectorNumElements();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(),
Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
// Cast ptr32 and ptr64 pointers to the default address space before a load.
unsigned AddrSpace = Ld->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
AddrSpace == X86AS::PTR32_UPTR) {
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
SDValue Cast =
DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
}
}
return SDValue();
}
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
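/// For example (illustrative), <i1 0, i1 1, i1 0, i1 0> returns 1, while an
/// all-zeros mask or a mask with more than one true element returns -1.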
static int getOneTrueElt(SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; eg, the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.isUndef())
continue;
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
}
}
return TrueIndex;
}
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
SDValue &Index, unsigned &Alignment) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it can be replaced by a scalar load and a vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
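/// For example (illustrative), a v4f32 masked load with mask <0,0,1,0> becomes
/// a scalar f32 load from base+8 whose result is inserted into the pass-through
/// vector at index 2.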
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Load the one scalar element that is specified by the mask using the
// appropriate offset from the base pointer.
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
Alignment, ML->getMemOperand()->getFlags());
// Insert the loaded element into the appropriate place in the vector.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
ML->getPassThru(), Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
// If we are loading the first and last elements of a vector, it is safe and
// always faster to load the whole vector. Replace the masked load with a
// vector load and select.
unsigned NumElts = VT.getVectorNumElements();
BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMemOperand());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
}
// Convert a masked load with a constant mask into a masked load and a select.
// This allows the select operation to use a faster kind of select instruction
// (for example, vblendvps -> vblendps).
// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.
if (ML->getPassThru().isUndef())
return SDValue();
if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
return SDValue();
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(
VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
auto *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mld->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
EVT VT = Mld->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
if (SDValue NewMask =
TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
return DAG.getMaskedLoad(
VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
Mld->getAddressingMode(), Mld->getExtensionType());
}
return SDValue();
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it can be replaced by a vector extract and a scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
Alignment, MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Mst->isTruncatingStore())
return SDValue();
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
if (SDValue NewMask =
TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
Mst->getBasePtr(), Mst->getOffset(), NewMask,
Mst->getMemoryVT(), Mst->getMemOperand(),
Mst->getAddressingMode());
}
SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
Mst->getBasePtr(), Mst->getOffset(), Mask,
Mst->getMemoryVT(), Mst->getMemOperand(),
Mst->getAddressingMode(), true);
}
return SDValue();
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
// This will avoid a copy to k-register.
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// Turn vXi1 stores of constants into a scalar store.
if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If it's a v64i1 store without 64-bit support, we need two stores.
if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// If we are saving a 32-byte vector and 32-byte stores are slow, such as on
// Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT &&
St->getAlignment() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
// to use MOVNTI.
if (VT.is128BitVector() && Subtarget.hasSSE2()) {
MVT NTVT = Subtarget.hasSSE4A()
? MVT::v2f64
: (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
return scalarizeVectorStore(St, NTVT, DAG);
}
}
// Try to optimize v16i16->v16i8 truncating stores when BWI is not
// supported but AVX512F is, by extending to v16i32 and truncating.
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
// Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
(StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
return EmitTruncSStore(IsSigned, St->getChain(),
dl, StoredVal.getOperand(0), St->getBasePtr(),
VT, St->getMemOperand(), DAG);
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
if (TLI.isTruncStoreLegal(VT, StVT)) {
if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
DAG, dl))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
}
return SDValue();
}
// Cast ptr32 and ptr64 pointers to the default address space before a store.
unsigned AddrSpace = St->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
AddrSpace == X86AS::PTR32_UPTR) {
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
if (PtrVT != St->getBasePtr().getSimpleValueType()) {
SDValue Cast =
DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags(), St->getAAInfo());
}
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
if (VT.getSizeInBits() != 64)
return SDValue();
const Function &F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
isa<LoadSDNode>(St->getValue()) &&
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
if (!ISD::isNormalLoad(Ld))
return SDValue();
// Avoid the transformation if there are multiple uses of the loaded value.
if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
// Lower to a single movq load/store pair.
SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
Ld->getBasePtr(), Ld->getMemOperand());
// Make sure new load is placed in same chain order.
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
St->getMemOperand());
}
// This is similar to the above case, but here we handle a scalar 64-bit
// integer store that is extracted from a vector on a 32-bit target.
// If we have SSE2, then we can treat it like a floating-point double
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
unsigned VecSize = ExtOp0.getValueSizeInBits();
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
auto *St = cast<MemIntrinsicSDNode>(N);
SDValue StoredVal = N->getOperand(1);
MVT VT = StoredVal.getSimpleValueType();
EVT MemVT = St->getMemoryVT();
// Figure out which elements we demand.
unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
KnownZero, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
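+ /// PostShuffleMask is filled with any shuffle that must be applied to the
+ /// result of the horizontal op to put the elements in the requested order;
+ /// it is left empty when no post-shuffle is required.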
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- bool IsCommutative) {
+ const X86Subtarget &Subtarget, bool IsCommutative,
+ SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
// Look for the following pattern:
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
// and
// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
unsigned NumElts = VT.getVectorNumElements();
// TODO - can we make a general helper method that does all of this for us?
auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
SmallVectorImpl<int> &ShuffleMask) {
if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (!Op.getOperand(0).isUndef())
N0 = Op.getOperand(0);
if (!Op.getOperand(1).isUndef())
N1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
ShuffleMask.append(Mask.begin(), Mask.end());
return;
}
bool UseSubVector = false;
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getValueType().is256BitVector() &&
llvm::isNullConstant(Op.getOperand(1))) {
Op = Op.getOperand(0);
UseSubVector = true;
}
bool IsUnary;
SmallVector<SDValue, 2> SrcOps;
SmallVector<int, 16> SrcShuffleMask;
SDValue BC = peekThroughBitcasts(Op);
if (isTargetShuffle(BC.getOpcode()) &&
getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
SrcOps, SrcShuffleMask, IsUnary)) {
if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
SrcOps.size() <= 2) {
N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
}
if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
SrcOps.size() == 1) {
N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
ShuffleMask.append(Mask.begin(), Mask.end());
}
}
};
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
SDValue A, B;
SmallVector<int, 16> LMask;
GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
SmallVector<int, 16> RMask;
GetShuffle(RHS, C, D, RMask);
// At least one of the operands should be a vector shuffle.
unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
if (NumShuffles == 0)
return false;
if (LMask.empty()) {
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask.push_back(i);
}
if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask.push_back(i);
}
+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+ if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+ return false;
+
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
std::swap(C, D);
ShuffleVectorSDNode::commuteMask(RMask);
}
// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D))
return false;
+ PostShuffleMask.clear();
+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
// LHS and RHS are now:
// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 || RIdx < 0 ||
(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;
+ // Check that successive odd/even elements are being operated on. If not,
+ // this is not a horizontal operation.
+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+ return false;
+
+ // Compute the post-shuffle mask index based on where the element
+ // is stored in the HOP result, and where it needs to be moved to.
+ int Base = LIdx & ~1u;
+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
- unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
-
- // Check that successive elements are being operated on. If not, this is
- // not a horizontal operation.
- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
- if (!(LIdx == Index && RIdx == Index + 1) &&
- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
- return false;
+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+ Index += NumEltsPer64BitChunk;
+ PostShuffleMask[i + j] = Index;
}
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+ bool IsIdentityPostShuffle =
+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+ if (IsIdentityPostShuffle)
+ PostShuffleMask.clear();
+
+ // Assume a SingleSource HOP if we only shuffle one input and don't need to
+ // shuffle the result.
+ if (!shouldUseHorizontalOp(LHS == RHS &&
+ (NumShuffles < 2 || !IsIdentityPostShuffle),
+ DAG, Subtarget))
return false;
LHS = DAG.getBitcast(VT, LHS);
RHS = DAG.getBitcast(VT, RHS);
return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;
auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
+ SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
auto IsFreeTruncation = [VT](SDValue Op) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
// See if this has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
unsigned Opcode = Op.getOpcode();
if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
Opcode == ISD::ZERO_EXTEND) &&
Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
// See if this is a single use constant which can be constant folded.
// NOTE: We don't peek through bitcasts here because there is currently
// no support for constant folding truncate+bitcast+vector_of_constants. So
// we'll just end up with a truncate on both operands which will
// get turned back into (truncate (binop)) causing an infinite loop.
return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
if (!Src.hasOneUse())
return SDValue();
// Only support vector truncation for now.
// TODO: i64 scalar math would benefit as well.
if (!VT.isVector())
return SDValue();
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 &&
TLI.isOperationLegal(SrcOpcode, VT) &&
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
case ISD::ADD:
case ISD::SUB: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
}
return SDValue();
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
OutVT.getScalarSizeInBits());
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
DAG.getValueType(OutVT));
return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in fewer instructions in the cases below.
if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
MVT VT = N->getValueType(0).getSimpleVT();
MVT SVT = VT.getScalarType();
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
// Truncation to sub-128bit vXi32 can be better handled with shuffles.
if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
return SDValue();
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying to pack.
if (Subtarget.hasAVX512() &&
!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
InVT.is512BitVector()))
return SDValue();
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known = DAG.computeKnownBits(In);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
// Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
// a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
// on and combines/simplifications can't then use it.
if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
return SDValue();
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
return SDValue();
}
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// First instruction should be a right shift of a multiply.
if (Src.getOpcode() != ISD::SRL ||
Src.getOperand(0).getOpcode() != ISD::MUL)
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
// Input type should be at least vXi32.
EVT InVT = Src.getValueType();
if (InVT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = Src.getOperand(0).getOperand(0);
SDValue RHS = Src.getOperand(0).getOperand(1);
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
if (LHS.getValueType() != VT || RHS.getValueType() != VT)
return SDValue();
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and saturates the result before
// truncating to 16 bits.
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
return SDValue();
// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
// of multiplies from even/odd elements.
SDValue N0 = SSatVal.getOperand(0);
SDValue N1 = SSatVal.getOperand(1);
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
// Canonicalize zero_extend to LHS.
if (N01.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N00, N01);
if (N11.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N10, N11);
// Ensure we have a zero_extend and a sign_extend.
if (N00.getOpcode() != ISD::ZERO_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::ZERO_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
N10.getValueType().getVectorElementType() != MVT::i8 ||
N11.getValueType().getVectorElementType() != MVT::i8)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// N00/N10 are zero extended. N01/N11 are sign extended.
// For each element, we need to ensure we have an odd element from one vector
// multiplied by the odd element of another vector and the even element from
// one of the same vectors being multiplied by the even element from the
// other vector. So we need to make sure for each element i, this operator
// is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue ZExtIn, SExtIn;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!ZExtIn) {
ZExtIn = N00In;
SExtIn = N01In;
}
if (ZExtIn != N00In || SExtIn != N01In ||
ZExtIn != N10In || SExtIn != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i8 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
PMADDBuilder);
}
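// Combine TRUNCATE nodes: pre-truncate arithmetic, match AVG/PMADDUBSW/
// saturation/PMULH patterns, fold mmx bitcast+truncate to MMX_MOVD2W, and
// finally try PACKSS/PACKUS based vector truncation.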
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
// Try to detect PMADD
if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
return PMAdd;
// Try to combine truncation with signed/unsigned saturation.
if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
// Try to combine PMULHUW/PMULHW for vXi16.
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 and x86mmx.
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
// Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
return combineVectorTruncation(N, DAG, Subtarget);
}
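// Combine X86ISD::VTRUNC: fold saturation patterns into VTRUNCS/VTRUNCUS,
// then fall back to simplifying demanded bits.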
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
if (auto SSatVal = detectSSatPattern(In, VT))
return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
/// or FSUB(0, x)
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
// Don't recurse exponentially.
if (Depth > SelectionDAG::MaxRecursionDepth)
return SDValue();
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
// Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::VECTOR_SHUFFLE: {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!Op.getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
cast<ShuffleVectorSDNode>(Op)->getMask());
break;
}
case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
break;
}
case ISD::FSUB:
case ISD::XOR:
case X86ISD::FXOR: {
SDValue Op1 = Op.getOperand(1);
SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant
// bits of Op1 are sign bit masks. For FSUB, we
// have to check if constant bits of Op0 are sign
// bit masks and hence we swap the operands.
if (Opc == ISD::FSUB)
std::swap(Op0, Op1);
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all
// sign bit masks. Ignore the undef elements.
if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
for (unsigned I = 0, E = EltBits.size(); I < E; I++)
if (!UndefElts[I] && !EltBits[I].isSignMask())
return SDValue();
return peekThroughBitcasts(Op0);
}
}
}
return SDValue();
}
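// Translate an FMA-family opcode into the variant with the multiplication,
// the accumulator, and/or the final result negated, as requested.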
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
bool NegRes) {
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMADD; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMADD: Opcode = ISD::FMA; break;
case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FMSUB; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FMSUB: Opcode = ISD::FMA; break;
case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
// For accuracy reasons, we never combine fneg and fma under strict FP.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
}
}
return Opcode;
}
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
if (!TLI.isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
return DAG.getBitcast(OrigVT, NewNode);
}
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
if (SDValue NegArg =
TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
return DAG.getBitcast(OrigVT, NegArg);
return SDValue();
}
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
NegatibleCost &Cost,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
Cost = NegatibleCost::Cheaper;
return DAG.getBitcast(Op.getValueType(), Arg);
}
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::FMA:
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case X86ISD::FMADD_RND:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
!(SVT == MVT::f32 || SVT == MVT::f64) ||
!isOperationLegal(ISD::FMA, VT))
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
for (int i = 0; i != 3; ++i)
NewOps[i] = getCheaperNegatedExpression(
Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
: NegatibleCost::Neutral;
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
}
case X86ISD::FRCP:
if (SDValue NegOp0 =
getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
ForCodeSize, Cost, Depth + 1))
return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
break;
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
ForCodeSize, Cost, Depth);
}
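// Lower an X86 FP logic op (FOR/FXOR/FAND/FANDN) to the equivalent integer
// vector logic op via bitcasts when SSE2 integer vector types are available.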
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
if (!VT.isVector() || !Subtarget.hasSSE2())
return SDValue();
SDLoc dl(N);
unsigned IntBits = VT.getScalarSizeInBits();
MVT IntSVT = MVT::getIntegerVT(IntBits);
MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
}
/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() != ISD::XOR)
return SDValue();
SDValue LHS = N->getOperand(0);
if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
X86::CondCode(LHS->getConstantOperandVal(0)));
SDLoc DL(N);
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// If this is SSE1 only convert to FXOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
N->getValueType(0) == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue SetCC = foldXor1SetCC(N, DAG))
return SetCC;
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
return combineFneg(N, DAG, DCI, Subtarget);
}
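// Simplify the inputs of a BEXTR node using demanded-bits analysis.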
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
// TODO - Constant Folding.
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}
/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!isNullFPScalarOrVectorConst(V))
return SDValue();
if (V.getValueType().isVector())
return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
return V;
}
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
return SDValue();
auto isAllOnesConstantFP = [](SDValue V) {
if (V.getSimpleValueType().isVector())
return ISD::isBuildVectorAllOnes(V.getNode());
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
};
// fand (fxor X, -1), Y --> fandn X, Y
if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
// fand X, (fxor Y, -1) --> fandn Y, X
if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
return V;
// FAND(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// FANDN(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
if (!DAG.getTarget().Options.NoNaNsFPMath ||
!DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
// into FMINC and FMAXC, which are commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
(VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
// If we don't have to respect NaN inputs, this is a direct translation to x86
// min/max instructions.
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
// If one of the operands is known non-NaN use the native min/max instructions
// with the non-NaN input as second operand.
if (DAG.isKnownNeverNaN(Op1))
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
if (DAG.isKnownNeverNaN(Op0))
return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
// Op1
// Num NaN
// ----------------
// Num | Max | Op0 |
// Op0 ----------------
// NaN | Op1 | NaN |
// ----------------
//
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
// Min = Op1 < Op0 ? Op1 : Op0
// Max = Op1 > Op0 ? Op1 : Op0
//
// So they always return Op0 if either input is a NaN. However, we can still
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
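// Combine X86 vector int-to-fp conversion nodes: drop source elements that
// are never converted and narrow a full vector load into a vzload when only
// the low elements are needed.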
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
return SDValue();
}
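// Likewise for (strict) vector fp-to-int conversions: narrow a full vector
// load into a vzload when only the low elements are converted.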
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
if (IsStrict) {
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
{N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
DCI.CombineTo(N, Convert, Convert.getValue(1));
} else {
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
}
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
return SDValue();
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// ANDNP(0, x) -> x
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return N->getOperand(1);
// ANDNP(x, 0) -> 0
if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
if (SDValue Not = IsNOT(N->getOperand(0), DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
return SDValue();
}
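// Combine (STRICT_)CVTPH2PS: for a v4f32 result only the low four v8i16
// source elements matter, so simplify the source and narrow a full vector
// load into a vzload.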
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getLowBitsSet(8, 4);
if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
// Convert a full vector load into vzload when not all bits are needed.
if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
SDLoc dl(N);
if (IsStrict) {
SDValue Convert = DAG.getNode(
N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
DCI.CombineTo(N, Convert, Convert.getValue(1));
} else {
SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
DAG.getBitcast(MVT::v8i16, VZLoad));
DCI.CombineTo(N, Convert);
}
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
}
return SDValue();
}
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
// Look through single use any_extends / truncs.
SDValue IntermediateBitwidthOp;
if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
N0.hasOneUse()) {
IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
}
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
return SDValue();
SDValue CMovOp0 = N0.getOperand(0);
SDValue CMovOp1 = N0.getOperand(1);
// Make sure both operands are constants.
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
SDLoc DL(N);
// If we looked through an any_extend/trunc above, add one to the constants.
if (IntermediateBitwidthOp) {
unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
}
CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
EVT CMovVT = DstVT;
// We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
if (DstVT == MVT::i16) {
CMovVT = MVT::i32;
CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
N0.getOperand(2), N0.getOperand(3));
if (CMovVT != DstVT)
CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
return CMov;
}
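// Combine SIGN_EXTEND_INREG: fold into a constant CMOV where possible and
// avoid an expensive v4i64 in-register sign extension by sign extending in
// v4i32 first.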
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
SDLoc dl(N);
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
// SSE and AVX2 since there is no sign-extended shift right
// operation on a vector with 64-bit elements.
//(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
// Attempt to promote any comparison mask ops before moving the
// SIGN_EXTEND_INREG in the way.
if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
SDValue Tmp =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
return SDValue();
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
Ext->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// TODO: This should be valid for other integer types.
EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
SDValue Add = Ext->getOperand(0);
if (Add.getOpcode() != ISD::ADD)
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
bool NSW = Add->getFlags().hasNoSignedWrap();
bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext'
if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
// the instruction count because the constant is extended for free below.
// A constant operand can also become the displacement field of an LEA.
auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
if (!AddOp1)
return SDValue();
// Don't make the 'add' bigger if there's no hope of combining it with some
// other 'add' or 'shl' instruction.
// TODO: It may be profitable to generate simpler LEA instructions in place
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
break;
}
}
if (!HasLEAPotential)
return SDValue();
// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
// (or more) pseudo-CMOVs only when they go one-after-another and
// getting rid of result extension code after CMOV will help that.
// 2) Promotion of constant CMOV arguments is free, hence the
// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
// promotion is also good in terms of code-size.
// (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
// promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
unsigned ExtendOpcode = Extend->getOpcode();
SDLoc DL(Extend);
EVT VT = CMovN.getValueType();
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
// Only extend to i32 or i64.
if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
return SDValue();
// Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
// are free.
if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
return SDValue();
// If this is a zero extend to i64, we should only extend to i32 and use a free
// zero extend to finish.
EVT ExtendVT = TargetVT;
if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
ExtendVT = MVT::i32;
CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
CMovN.getOperand(2), CMovN.getOperand(3));
// Finish extending if needed.
if (ExtendVT != TargetVT)
Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
return Res;
}
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
Opcode != ISD::ANY_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InSVT = N0.getValueType().getScalarType();
unsigned EltSizeInBits = SVT.getSizeInBits();
// Input type must be extending a bool vector (bit-casted from a scalar
// integer) to legal integer types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
return SDValue();
if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
EVT SclVT = N0.getOperand(0).getValueType();
if (!SclVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
SDValue Vec;
SmallVector<int, 32> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
// Broadcast the scalar integer to the vector elements.
if (NumElts > EltSizeInBits) {
// If the scalar integer is greater than the vector element size, then we
// must split it down into sub-sections for broadcasting. For example:
// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
unsigned Scale = NumElts / EltSizeInBits;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
Vec = DAG.getBitcast(VT, Vec);
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
} else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
(SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
// If we have register broadcast instructions, use the scalar size as the
// element type for the shuffle. Then cast to the wider element type. The
// widened bits won't be used, and this might allow the use of a broadcast
// load.
assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
unsigned Scale = EltSizeInBits / NumElts;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
ShuffleMask.append(NumElts * Scale, 0);
Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
Vec = DAG.getBitcast(VT, Vec);
} else {
// For a smaller scalar integer, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
// elements.
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
}
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
for (unsigned i = 0; i != NumElts; ++i) {
int BitIdx = (i % EltSizeInBits);
APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
Bits.push_back(DAG.getConstant(Bit, DL, SVT));
}
SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
// For SEXT, this is now done, otherwise shift the result down for
// zero-extension.
if (Opcode == ISD::SIGN_EXTEND)
return Vec;
return DAG.getNode(ISD::SRL, DL, VT, Vec,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
EVT SVT = VT.getVectorElementType();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
if (Size > 256 && Subtarget.useAVX512Regs())
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// those are the only integer compares we have.
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
// Only do this combine if the extension will be fully consumed by the setcc.
EVT N00VT = N0.getOperand(0).getValueType();
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (Size != MatchingVecType.getSizeInBits())
return SDValue();
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
return Res;
}
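// Combiner for ISD::SIGN_EXTEND: widen SETCC_CARRY, promote constant CMOVs,
// fold sext(setcc), turn sext(xor Bool, -1) into sub(zext Bool, 1), handle
// bool-vector extensions, and promote extends ahead of adds.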
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
SDLoc DL(N);
// (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
if (!DCI.isBeforeLegalizeOps() &&
N0.getOpcode() == X86ISD::SETCC_CARRY) {
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
N0->getOperand(1));
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
}
return SDValue(N, 0);
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Inverting and sign-extending a boolean is the same as zero-extending and
// subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
// efficiently lowered with an LEA or a DEC. This is the same as:
// select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
}
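// Try to absorb negated (FNEG'd) operands of an FMA node by switching to the
// matching negated FMA opcode, so the explicit negations disappear.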
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(IsStrict ? 1 : 0);
SDValue B = N->getOperand(IsStrict ? 2 : 1);
SDValue C = N->getOperand(IsStrict ? 3 : 2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
CodeSize)) {
V = NegV;
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
if (SDValue NegV = TLI.getCheaperNegatedExpression(
Vec, DAG, LegalOperations, CodeSize)) {
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegV, V.getOperand(1));
return true;
}
}
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
bool NegC = invertIfNegative(C);
if (!NegA && !NegB && !NegC)
return SDValue();
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
if (IsStrict) {
assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
{N->getOperand(0), A, B, C});
} else {
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
SDValue N2 = N->getOperand(2);
SDValue NegN2 =
TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
if (!NegN2)
return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegN2, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegN2);
}
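// Combiner for zero/any extends: mirrors the sign-extend path above and
// additionally folds zext(packus) into a subvector concatenation when the
// packed inputs already have their upper bits clear.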
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
// FIXME: Is this needed? We don't seem to have any tests for it.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
N0.getOpcode() == X86ISD::SETCC_CARRY) {
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
N0->getOperand(1));
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
}
return SDValue(N, 0);
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (DCI.isBeforeLegalizeOps())
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
// TODO: Combine with any target/faux shuffle.
if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
return concatSubVectors(N00, N01, DAG, dl);
}
}
return SDValue();
}
/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
/// recognizable memcmp expansion.
static bool isOrXorXorTree(SDValue X, bool Root = true) {
if (X.getOpcode() == ISD::OR)
return isOrXorXorTree(X.getOperand(0), false) &&
isOrXorXorTree(X.getOperand(1), false);
if (Root)
return false;
return X.getOpcode() == ISD::XOR;
}
/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
/// expansion.
template<typename F>
static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
SDValue Op0 = X.getOperand(0);
SDValue Op1 = X.getOperand(1);
if (X.getOpcode() == ISD::OR) {
SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
if (VecVT != CmpVT)
return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
if (HasPT)
return DAG.getNode(ISD::OR, DL, VecVT, A, B);
return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
} else if (X.getOpcode() == ISD::XOR) {
SDValue A = SToV(Op0);
SDValue B = SToV(Op1);
if (VecVT != CmpVT)
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
if (HasPT)
return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
}
llvm_unreachable("Impossible");
}
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
// We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
auto IsVectorBitCastCheap = [](SDValue X) {
X = peekThroughBitcasts(X);
return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
!IsOrXorXorTreeCCZero)
return SDValue();
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
// vector registers are essentially free. (Technically, widening registers
// prevents load folding, but the tradeoff is worth it.)
bool PreferKOT = Subtarget.preferMaskRegisters();
bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
EVT VecVT = MVT::v16i8;
EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
if (OpSize == 256) {
VecVT = MVT::v32i8;
CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
}
EVT CastVT = VecVT;
bool NeedsAVX512FCast = false;
if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
if (OpSize == 512)
CastVT = VecVT;
} else {
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
CastVT = OpSize == 512 ? VecVT :
OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
NeedsAVX512FCast = true;
}
}
auto ScalarToVector = [&](SDValue X) -> SDValue {
bool TmpZext = false;
EVT TmpCastVT = CastVT;
if (X.getOpcode() == ISD::ZERO_EXTEND) {
SDValue OrigX = X.getOperand(0);
unsigned OrigSize = OrigX.getScalarValueSizeInBits();
if (OrigSize < OpSize) {
if (OrigSize == 128) {
TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
X = OrigX;
TmpZext = true;
} else if (OrigSize == 256) {
TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
X = OrigX;
TmpZext = true;
}
}
}
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
DAG.getConstant(0, DL, VecVT), X,
DAG.getVectorIdxConstant(0, DL));
};
SDValue Cmp;
if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
SDValue VecX = ScalarToVector(X);
SDValue VecY = ScalarToVector(Y);
if (VecVT != CmpVT) {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
} else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
}
// AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
DAG.getConstant(0, DL, KRegVT), CC);
}
if (HasPT) {
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
Cmp);
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
assert(Cmp.getValueType() == MVT::v16i8 &&
"Non 128-bit vector on pre-SSE41 target");
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
return SDValue();
}
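// Combiner for ISD::SETCC: map oversized integer equality compares to vector
// code, fold setcc of sign-extended vXi1 operands against zero, pre-promote
// vXi16/vXi8 setcc results on AVX512-without-BWI, and lower v4f32 compares
// early on SSE1-only targets.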
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
if (VT == MVT::i1 && isNullConstant(RHS)) {
SDValue X86CC;
if (SDValue V =
MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
return DAG.getNode(ISD::TRUNCATE, DL, VT,
DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
}
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
// Using temporaries to avoid messing up operand ordering for later
// transformations if this doesn't work.
SDValue Op0 = LHS;
SDValue Op1 = RHS;
ISD::CondCode TmpCC = CC;
// Put build_vector on the right.
if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
std::swap(Op0, Op1);
TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
}
bool IsSEXT0 =
(Op0.getOpcode() == ISD::SIGN_EXTEND) &&
(Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
"Unexpected operand type");
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
return DAG.getNOT(DL, Op0.getOperand(0), VT);
assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
return Op0.getOperand(0);
}
}
// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
// pre-promote its result type since vXi1 vectors don't get promoted
// during type legalization.
// NOTE: The element count check is to ignore operand types that need to
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
return SDValue();
}
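// Combiner for X86ISD::MOVMSK: constant fold, look through same-width
// int->fp bitcasts, rewrite movmsk(not(x)) as not(movmsk(x)), and simplify
// the demanded bits of the result.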
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
unsigned NumBits = VT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
if (!Src.getOperand(Idx).isUndef() &&
Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
// Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
// with scalar comparisons.
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
NotSrc = DAG.getBitcast(SrcVT, NotSrc);
return DAG.getNode(ISD::XOR, DL, VT,
DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
DAG.getConstant(NotMask, DL, VT));
}
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// With vector masks we only demand the upper bit of the mask.
SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
}
return SDValue();
}
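// Helper to rebuild a masked gather/scatter node with updated base, index and
// scale operands.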
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
SDValue Index, SDValue Base, SDValue Scale,
SelectionDAG &DAG) {
SDLoc DL(GorS);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
Gather->getMask(), Base, Index, Scale } ;
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
Gather->getIndexType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
Scatter->getMask(), Base, Index, Scale };
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
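// Combiner for generic masked gather/scatter nodes: shrink or legalize the
// index type where profitable and simplify the mask, of which only the sign
// bit of each element is demanded.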
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
SDValue Index = GorS->getIndex();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
if (DCI.isBeforeLegalize()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Shrink constant indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize
// types, but then we would need to construct test cases where that happens.
// FIXME: We could support more than just constant vectors, but we need to
// careful with costing. A truncate that can be optimized out would be fine.
// Otherwise we might only want to create a truncate if it avoids a split.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
// Shrink any sign/zero extends that go from a type of 32 bits or smaller to
// one wider than 32 bits, provided there are sufficient sign bits. Only do
// this before legalize types to avoid creating illegal types in truncate.
if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
Index.getOpcode() == ISD::ZERO_EXTEND) &&
IndexWidth > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
if (DCI.isBeforeLegalizeOps()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
if (IndexWidth != 32 && IndexWidth != 64) {
MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
// With vector masks we only demand the upper bit of the mask.
SDValue Mask = GorS->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
}
return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
}
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
return SDValue();
}
// TODO: Could we move this to DAGCombine?
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
// to optimize away the operation when it's applied to a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst;
if (IsStrict)
SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
{N->getOperand(0), SDValue(BV, 0)});
else
SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
if (IsStrict)
return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
}
return SDValue();
}
/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
// TODO: This is currently only used by combineSIntToFP, but it is generalized
// to allow being called by any similar cast opcode.
// TODO: Consider merging this into lowering: vectorizeExtractedCast().
SDValue Trunc = N->getOperand(0);
if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue ExtElt = Trunc.getOperand(0);
if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isNullConstant(ExtElt.getOperand(1)))
return SDValue();
EVT TruncVT = Trunc.getValueType();
EVT SrcVT = ExtElt.getValueType();
unsigned DestWidth = TruncVT.getSizeInBits();
unsigned SrcWidth = SrcVT.getSizeInBits();
if (SrcWidth % DestWidth != 0)
return SDValue();
// inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
unsigned VecWidth = SrcVecVT.getSizeInBits();
unsigned NumElts = VecWidth / DestWidth;
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
SDLoc DL(N);
SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
BitcastVec, ExtElt.getOperand(1));
return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
}
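// Combiner for [STRICT_]UINT_TO_FP: zero-extend narrow vector sources to i32
// and switch to SINT_TO_FP when the sign bit is known to be zero.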
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsStrict = N->isStrictFPOpcode();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(Op0)) {
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
{N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
}
return SDValue();
}
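// Combiner for [STRICT_]SINT_TO_FP: fold conversions of masked vector
// compares, widen narrow vector sources, narrow wide sources with enough sign
// bits, and form X87 FILD loads on 32-bit targets.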
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Without AVX512DQ we only support i64 to float scalar conversion. For both
// vectors and scalars, see if we know that the upper bits are all the sign
// bit, in which case we can truncate the input to i32 and convert from that.
if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = MVT::i32;
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
// If we're after legalize and the type is v2i32 we need to shuffle and
// use CVTSI2P.
assert(InVT == MVT::v2i64 && "Unexpected VT!");
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
// If we have AVX512DQ we can use packed conversion instructions unless
// the VT is f80.
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
std::pair<SDValue, SDValue> Tmp =
Subtarget.getTargetLowering()->BuildFILD(
VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
}
}
if (IsStrict)
return SDValue();
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
return SDValue();
}
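// Return true if any user of these flags may read the carry, overflow or
// signed-comparison bits (conservatively true for unrecognized users).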
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
X86::CondCode CC;
switch (User->getOpcode()) {
default:
// Be conservative.
return true;
case X86ISD::SETCC:
case X86ISD::SETCC_CARRY:
CC = (X86::CondCode)User->getConstantOperandVal(0);
break;
case X86ISD::BRCOND:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
case X86ISD::CMOV:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
}
switch (CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_O: case X86::COND_NO:
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
return true;
}
}
return false;
}
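// Return true if every user of these flags only tests for (in)equality, i.e.
// only the zero flag is consumed.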
static bool onlyZeroFlagUsed(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
unsigned CCOpNo;
switch (User->getOpcode()) {
default:
// Be conservative.
return false;
case X86ISD::SETCC: CCOpNo = 0; break;
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
case X86ISD::BRCOND: CCOpNo = 2; break;
case X86ISD::CMOV: CCOpNo = 2; break;
}
X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
if (CC != X86::COND_E && CC != X86::COND_NE)
return false;
}
return true;
}
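// Combiner for X86ISD::CMP against zero: turn constant shifts into AND/TEST
// patterns and narrow truncated binops so their flag results can be used
// directly.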
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
// Only handle test patterns.
if (!isNullConstant(N->getOperand(1)))
return SDValue();
// If we have a CMP of a truncated binop, see if we can make a smaller binop
// and use its flags directly.
// TODO: Maybe we should try promoting compares that only use the zero flag
// first if we can prove the upper bits with computeKnownBits?
SDLoc dl(N);
SDValue Op = N->getOperand(0);
EVT VT = Op.getValueType();
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
unsigned BitWidth = VT.getSizeInBits();
const APInt &ShAmt = Op.getConstantOperandAPInt(1);
if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, MaskBits)
: APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
}
}
}
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
Op = Op.getOperand(0);
// Arithmetic op can only have one use.
if (!Op.hasOneUse())
return SDValue();
unsigned NewOpc;
switch (Op.getOpcode()) {
default: return SDValue();
case ISD::AND:
// Skip AND with a constant. We have special handling for AND with an
// immediate during isel to generate TEST instructions.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
NewOpc = X86ISD::AND;
break;
case ISD::OR: NewOpc = X86ISD::OR; break;
case ISD::XOR: NewOpc = X86ISD::XOR; break;
case ISD::ADD:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::ADD;
break;
case ISD::SUB:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::SUB;
break;
}
// We found an op we can narrow. Truncate its inputs.
SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
// Use an X86-specific opcode to keep DAG combine from messing with it.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
// For AND, keep a CMP so that we can match the test pattern.
if (NewOpc == X86ISD::AND)
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
// Return the flags.
return Op.getValue(1);
}
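// Combiner for the flag-producing X86ISD::ADD/SUB nodes: fall back to the
// generic opcode when the flags are unused, and fold matching generic
// ADD/SUB nodes into this one.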
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
"Expected X86ISD::ADD or X86ISD::SUB");
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
MVT VT = LHS.getSimpleValueType();
unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
// If we don't use the flag result, simplify back to a generic ADD/SUB.
if (!N->hasAnyUseOfValue(1)) {
SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
}
// Fold any similar generic ADD/SUB opcodes to reuse this node.
auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate)
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
DCI.CombineTo(GenericAddSub, Op);
}
};
MatchGeneric(LHS, RHS, false);
MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
return SDValue();
}
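// Optimize RES, EFLAGS = X86ISD::SBB LHS, RHS, EFLAGS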
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
// Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
// iff the flag result is dead.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
!N->hasAnyUseOfValue(1))
return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
Op0.getOperand(1), N->getOperand(2));
return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
if (X86::isZeroNode(N->getOperand(0)) &&
X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// the flag result is dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 =
DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
N->getOperand(2)),
DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
return SDValue();
}
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
}
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
}
if (CC == X86::COND_B) {
// X + SETB Z --> adc X, 0
// X - SETB Z --> sbb X, 0
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), Y.getOperand(1));
}
if (CC == X86::COND_A) {
SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
if (CC == X86::COND_AE) {
// X + SETAE --> sbb X, -1
// X - SETAE --> adc X, -1
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(-1, DL, VT), Y.getOperand(1));
}
if (CC == X86::COND_BE) {
// X + SETBE --> sbb X, -1
// X - SETBE --> adc X, -1
SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_BE into COND_AE in an attempt to facilitate
// materializing "setae reg".
//
// Do not flip "e <= c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(-1, DL, VT), NewEFLAGS);
}
}
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Cmp1.getValue(1));
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
// Example of pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
//(add (build_vector (extract_elt t, 0),
// (extract_elt t, 2),
// (extract_elt t, 4),
// (extract_elt t, 6)),
// (build_vector (extract_elt t, 1),
// (extract_elt t, 3),
// (extract_elt t, 5),
// (extract_elt t, 7)))
if (!Subtarget.hasSSE2())
return SDValue();
if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
Op1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
// (extract_elt Mul, 4),
// ...
// the other is of the form:
// (build_vector (extract_elt Mul, 1),
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
// and identify Mul.
SDValue Mul;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
if (!Const0L || !Const1L || !Const0H || !Const1H)
return SDValue();
unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
if (Idx0L > Idx1L)
std::swap(Idx0L, Idx1L);
if (Idx0H > Idx1H)
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
if (Idx0L > Idx0H) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
}
if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
Idx1H != 2 * i + 3)
return SDValue();
if (!Mul) {
// First time an extract_elt's source vector is visited. It must be a MUL
// with twice the number of vector elements of the BUILD_VECTOR.
// Both extracts must be from the same MUL.
Mul = Op0L->getOperand(0);
if (Mul->getOpcode() != ISD::MUL ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
}
// Check that the extract is from the same MUL previously seen.
if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
return SDValue();
}
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
Mode == ShrinkMode::MULU16)
return SDValue();
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements() * 2);
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
EVT InVT = Ops[0].getValueType();
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// All inputs need to be sign extends.
// TODO: Support ZERO_EXTEND from known positive?
if (N00.getOpcode() != ISD::SIGN_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::SIGN_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Must be extending from vXi16.
EVT InVT = N00.getValueType();
if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
N10.getValueType() != InVT || N11.getValueType() != InVT)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// For each element, we need to ensure we have an odd element from one vector
// multiplied by the odd element of another vector and the even element from
// one of the same vectors being multiplied by the even element from the
// other vector. So we need to make sure that, for each element i, this
// operation is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue In0, In1;
for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even elements. N1 indices must be the next odd elements.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!In0) {
In0 = N00In;
In1 = N01In;
}
// Mul is commutative so the input vectors can be in any order.
// Canonicalize to make the compares easier.
if (In0 != N00In)
std::swap(N00In, N01In);
if (In0 != N10In)
std::swap(N10In, N11In);
if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT OpVT = Ops[0].getValueType();
assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
PMADDBuilder);
}
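// Turn (add/sub X, Y) into X86ISD::HADD/HSUB when X and Y are shuffles that
// pair up adjacent elements of the same sources. If the match requires the
// result lanes to be permuted, isHorizontalBinOp records that permutation in
// PostShuffleMask and it is re-applied below as an ordinary vector shuffle.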
static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
+ SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
- DL, Ops[0].getValueType(), Ops);
+ return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
+ Ops[0].getValueType(), Ops);
};
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
- HOpBuilder);
+ SDValue HorizBinOp =
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
}
return SDValue();
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
return V;
// If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
// (sub Y, (sext (vXi1 X))).
// FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
// generic DAG combine without a legal type check, but adding this there
// caused regressions.
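// For example, (add (zext (v4i1 M)), Y) becomes (sub Y, (sext (v4i1 M))):
// sign extending an i1 mask yields 0 or -1, and subtracting -1 is the same
// as adding 1.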
if (VT.isVector()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
}
if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
}
}
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
// PSUBUS is supported, starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
EVT EltVT = VT.getVectorElementType();
if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns;
// they may be converted to subus(a,b).
// TODO: Need to add IR canonicalization for this code.
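// For example, (sub (umax a, b), b) is max(a - b, 0), i.e. the unsigned
// saturating subtraction usubsat(a, b); (sub a, (umin a, b)) computes the
// same value.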
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
SDValue MaxRHS = Op0.getOperand(1);
if (MaxLHS == Op1)
SubusLHS = MaxRHS;
else if (MaxRHS == Op1)
SubusLHS = MaxLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::UMIN) {
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0);
SDValue MinRHS = Op1.getOperand(1);
if (MinLHS == Op0)
SubusRHS = MinRHS;
else if (MinRHS == Op0)
SubusRHS = MinLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::TRUNCATE &&
Op1.getOperand(0).getOpcode() == ISD::UMIN &&
(EltVT == MVT::i8 || EltVT == MVT::i16)) {
// Special case where the UMIN has been truncated. Try to push the truncate
// further up. This is similar to the i32/i64 special processing.
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0).getOperand(0);
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
EVT TruncVT = Op1.getOperand(0).getValueType();
if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
TruncVT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
return SDValue();
SDValue OpToSaturate;
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
MinLHS.getOperand(0) == Op0)
OpToSaturate = MinRHS;
else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
MinRHS.getOperand(0) == Op0)
OpToSaturate = MinLHS;
else
return SDValue();
// Saturate the non-extended input and then truncate it.
SDLoc DL(N);
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
VT.getScalarSizeInBits()),
DL, TruncVT);
SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
SaturationConst);
SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (EltVT == MVT::i8 || EltVT == MVT::i16)
return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
"Unexpected VT!");
// The special preprocessing can only be applied if the value was zero
// extended from 16 bits, so we require the upper 16 bits to be zero for
// 32-bit values, or the upper 48 bits for 64-bit values.
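// For example, a v8i32 subtraction whose LHS is known to have its upper 16
// bits clear can be done as a v8i16 USUBSAT: clamp the RHS to 0xFFFF with
// UMIN, truncate both sides, subtract with saturation, and zero extend the
// result back to v8i32.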
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
return SDValue();
EVT ExtType = SubusLHS.getValueType();
EVT ShrinkedType;
if (VT == MVT::v8i32 || VT == MVT::v8i64)
ShrinkedType = MVT::v8i16;
else
ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
// SubusLHS is zero extended, so clamp SubusRHS to the narrow type's maximum
// before truncating: SubusRHS = umin(0xFFF.., SubusRHS).
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
ShrinkedType.getScalarSizeInBits()),
SDLoc(SubusLHS), ExtType);
SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
SaturationConst);
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
NewSubusLHS, NewSubusRHS);
// Zero extend the result; it may be used somewhere as 32 bit. If not, the
// zext and the following trunc will be folded away.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is a XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, SDLoc(Op1), VT));
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
}
}
// Try to synthesize horizontal subs from subs of shuffles.
if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
return V;
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
return DAG.getConstant(-1, DL, VT);
if (N->getOpcode() == X86ISD::PCMPGT)
return DAG.getConstant(0, DL, VT);
}
return SDValue();
}
/// Helper that combines an array of subvector ops as if they were the operands
/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
if (llvm::all_of(Ops, [](SDValue Op) {
return ISD::isBuildVectorAllZeros(Op.getNode());
}))
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
bool Fast;
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
if (SDValue Ld =
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
return Ld;
}
}
// Repeated subvectors.
if (IsSplat) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
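// For example, concat_vectors(vbroadcast(x), vbroadcast(x)) for a v8f32
// result becomes a single 256-bit vbroadcast(x).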
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
// If this broadcast_load is inserted into both halves, use a larger
// broadcast_load. Update other uses to use an extracted subvector.
if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(
Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
DAG.getIntPtrConstant(0, DL)));
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
// concat_vectors(extract_subvector(broadcast(x)),
// extract_subvector(broadcast(x))) -> broadcast(x)
if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op0.getOperand(0).getValueType() == VT) {
if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
return Op0.getOperand(0);
}
}
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
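// For example, concat_vectors(pshufd(x, imm), pshufd(y, imm)) with matching
// immediates becomes pshufd(concat_vectors(x, y), imm); the per-lane shifts
// and rotates below are handled the same way.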
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::SHUFP: {
// Add SHUFPD support if/when necessary.
if (!IsSplat && VT.getScalarType() == MVT::f32 &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOperand(2) == Op0.getOperand(2);
})) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
Op0.getOperand(2));
}
break;
}
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
break;
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
break;
case X86ISD::VPERMI:
case X86ISD::VROTLI:
case X86ISD::VROTRI:
if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
break;
case X86ISD::PACKSS:
case X86ISD::PACKUS:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
case X86ISD::PALIGNR:
if (!IsSplat &&
((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useBWIRegs())) &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(2) == Op.getOperand(2);
})) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
Op0.getOperand(2));
}
break;
}
}
return SDValue();
}
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Don't do anything for i1 vectors.
if (VT.getVectorElementType() == MVT::i1)
return SDValue();
if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
DCI, Subtarget))
return R;
}
return SDValue();
}
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT OpVT = N->getSimpleValueType(0);
bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
if (Vec.isUndef() && SubVec.isUndef())
return DAG.getUNDEF(OpVT);
// Inserting undefs/zeros into zeros/undefs is a zero vector.
if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
(SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
return getZeroVector(OpVT, Subtarget, DAG, dl);
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
}
// If we're inserting into a zero vector and our input was extracted from an
// insert into a zero vector of the same type, and the extraction was at
// least as large as the original insertion, just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
isNullConstant(SubVec.getOperand(1)) &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
}
// Stop here if this is an i1 vector.
if (IsI1Vector)
return SDValue();
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
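// For example, inserting the upper half of a v8i32 W into the low half of a
// v8i32 V becomes shuffle(V, W, <12,13,14,15,4,5,6,7>).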
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 ||
!(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
Mask[i] = i;
// Now insert the extracted portion.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
}
}
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
if (collectConcatOps(N, SubVectorOps)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
// If we're inserting all zeros into the upper half, change this to
// a concat with zero. We will match this to a move
// with implicit upper bit zeroing during isel.
// We do this here because we don't want combineConcatVectorOps to
// create INSERT_SUBVECTOR from CONCAT_VECTORS.
if (SubVectorOps.size() == 2 &&
ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
}
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
// If this is a broadcast load inserted into an upper undef, use a larger
// broadcast load.
if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
MemIntr->getMemoryVT(),
MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
return SDValue();
}
/// If we are extracting a subvector of a vector select and the select condition
/// is composed of concatenated vectors, try to narrow the select width. This
/// is a common pattern for AVX1 integer code because 256-bit selects may be
/// legal, but there is almost no integer math/logic available for 256-bit.
/// This function should only be called with legal types (otherwise, the calls
/// to get simple value types will assert).
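/// For example, extracting the upper 128-bit half of
/// vselect(concat_vectors(C0, C1), T, F) becomes (ignoring bitcasts)
/// vselect(C1, extract(T, upper), extract(F, upper)) as a 128-bit operation.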
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
SmallVector<SDValue, 4> CatOps;
if (Sel.getOpcode() != ISD::VSELECT ||
!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
return SDValue();
// Note: We assume simple value types because this should only be called with
// legal operations/types.
// TODO: This can be extended to handle extraction to 256-bits.
MVT VT = Ext->getSimpleValueType(0);
if (!VT.is128BitVector())
return SDValue();
MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
return SDValue();
MVT WideVT = Ext->getOperand(0).getSimpleValueType();
MVT SelVT = Sel.getSimpleValueType();
assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations");
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
ExtIdx *= (SelElts / CastedElts);
} else if (CastedElts % SelElts == 0) {
// The select has fewer (wider) elements than the extract operand. Make sure
// that the extraction index can be divided evenly.
unsigned IndexDivisor = CastedElts / SelElts;
if (ExtIdx % IndexDivisor != 0)
return SDValue();
ExtIdx /= IndexDivisor;
} else {
llvm_unreachable("Element count of simple vector types are not divisible?");
}
unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
unsigned NarrowElts = SelElts / NarrowingFactor;
MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
SDLoc DL(Ext);
SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
return DAG.getBitcast(VT, NarrowSel);
}
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
// We let generic combining take over from there to simplify the
// insert/extract and 'not'.
// This pattern emerges during AVX1 legalization. We handle it before lowering
// to avoid complications like splitting constant vector loads.
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
if (!N->getValueType(0).isSimple())
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(InVecVT) &&
InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
if (VT.getScalarType() == MVT::i1)
return DAG.getConstant(1, SDLoc(N), VT);
return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero as long as the extraction covers at least the
// originally inserted subvector. Don't do this for i1 vectors.
if (VT.getVectorElementType() != MVT::i1 &&
InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
SDLoc DL(N);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL),
InVec.getOperand(1), InVec.getOperand(2));
}
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As it's a broadcast we don't care about the extraction index.
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
MemIntr->getMemoryVT(),
MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
}
// If we're extracting an upper subvector from a broadcast we should just
// extract the lowest subvector instead, which should allow
// SimplifyDemandedVectorElts to do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
// If we're extracting a broadcasted subvector, just use the source.
if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
InVec.getOperand(0).getValueType() == VT)
return InVec.getOperand(0);
// Attempt to extract from the source of a shuffle vector.
if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
(IdxVal % VT.getVectorNumElements()) == 0) {
SmallVector<int, 32> ShuffleMask;
SmallVector<int, 32> ScaledMask;
SmallVector<SDValue, 2> ShuffleInputs;
unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
// Decode the shuffle mask and scale it so it shuffles whole subvectors.
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (ScaledMask[SubVecIdx] == SM_SentinelZero)
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
SDLoc(N), VT.getSizeInBits());
}
}
}
// If we're extracting the lowest subvector and the source has only one use,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
if ((InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
SDLoc DL(N);
SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
return SDValue();
}
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
// floating point select lowering with AVX512.
// TODO: SimplifyDemandedBits instead?
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
// TODO: Move to DAGCombine/SimplifyDemandedBits?
if (VT == MVT::v2i64 || VT == MVT::v2f64) {
auto IsAnyExt64 = [](SDValue Op) {
if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
return SDValue();
if (Op.getOpcode() == ISD::ANY_EXTEND &&
Op.getOperand(0).getScalarValueSizeInBits() <= 32)
return Op.getOperand(0);
if (auto *Ld = dyn_cast<LoadSDNode>(Op))
if (Ld->getExtensionType() == ISD::EXTLOAD &&
Ld->getMemoryVT().getScalarSizeInBits() <= 32)
return Op;
return SDValue();
};
if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
return DAG.getBitcast(
VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
}
// Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
return SDValue();
}
// Simplify PMULDQ and PMULUDQ operations.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
!DAG.isConstantIntBuildVectorOrConstantInt(RHS))
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
// Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
// convert it to any_extend_invec, due to the LegalOperations check, do the
// conversion directly to a vector shuffle manually. This exposes combine
// opportunities missed by combineExtInVec not calling
// combineX86ShufflesRecursively on SSE4.1 targets.
// FIXME: This is basically a hack around several other issues related to
// ANY_EXTEND_VECTOR_INREG.
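// For example, (pmuludq (zero_extend_vector_inreg v4i32 X), Y) becomes
// (pmuludq (bitcast (shuffle X, X, <0,-1,1,-1>)), Y); the undef high halves
// are fine because PMULUDQ only reads the low 32 bits of each i64 lane.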
if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
(LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
LHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
LHS.getOperand(0), { 0, -1, 1, -1 });
LHS = DAG.getBitcast(MVT::v2i64, LHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
(RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
RHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
RHS.getOperand(0), { 0, -1, 1, -1 });
RHS = DAG.getBitcast(MVT::v2i64, RHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
return SDValue();
}
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
? ISD::SEXTLOAD
: ISD::ZEXTLOAD;
EVT MemVT =
EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT,
Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
}
}
}
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
}
// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
// produce extra instructions between the conversions due to going to scalar
// and back.
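// For example, an f32 value rounded to half and back, (fp16_to_fp
// (fp_to_fp16 X)), is emitted as CVTPS2PH followed by CVTPH2PS on a vector,
// keeping the value in an XMM register the whole time.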
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
return SDValue();
if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
return SDValue();
if (N->getValueType(0) != MVT::f32 ||
N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
return SDValue();
SDLoc dl(N);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
N->getOperand(0).getOperand(0));
Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
DAG.getTargetConstant(4, dl, MVT::i32));
Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
DAG.getIntPtrConstant(0, dl));
}
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
return SDValue();
if (VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
SDLoc dl(N);
// Convert the input to vXi16.
EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
Src = DAG.getBitcast(IntVT, Src);
// Widen to at least 8 input elements.
if (NumElts < 8) {
unsigned NumConcats = 8 / NumElts;
SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
: DAG.getConstant(0, dl, IntVT);
SmallVector<SDValue, 4> Ops(NumConcats, Fill);
Ops[0] = Src;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
}
// Destination is vXf32 with at least 4 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
std::max(4U, NumElts));
SDValue Cvt, Chain;
if (IsStrict) {
Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
{N->getOperand(0), Src});
Chain = Cvt.getValue(1);
} else {
Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
}
if (NumElts < 4) {
assert(NumElts == 2 && "Unexpected size");
Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
DAG.getIntPtrConstant(0, dl));
}
if (IsStrict) {
// Extend to the original VT if necessary.
if (Cvt.getValueType() != VT) {
Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
{Chain, Cvt});
Chain = Cvt.getValue(1);
}
return DAG.getMergeValues({Cvt, Chain}, dl);
}
// Extend to the original VT if necessary.
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
}
// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
// cases where the loads have the same input chain and the output chains are
// unused. This avoids any memory ordering issues.
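// For example, a v4i32 broadcast load of a pointer can reuse an existing
// v8i32 broadcast load of the same pointer (same chain, same broadcast
// element size) by extracting the low 128 bits of the wider result.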
static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// Only do this if the chain result is unused.
if (N->hasAnyUseOfValue(1))
return SDValue();
auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
SDValue Ptr = MemIntrin->getBasePtr();
SDValue Chain = MemIntrin->getChain();
EVT VT = N->getSimpleValueType(0);
EVT MemVT = MemIntrin->getMemoryVT();
// Look at other users of our base pointer and try to find a wider broadcast.
// The input chain and the size of the memory VT must match.
for (SDNode *User : Ptr->uses())
if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
MemVT.getSizeInBits() &&
!User->hasAnyUseOfValue(1) &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
VT.getSizeInBits());
Extract = DAG.getBitcast(VT, Extract);
return DCI.CombineTo(N, Extract, SDValue(User, 1));
}
return SDValue();
}
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
SrcVT.getVectorElementType() != MVT::f32)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
SDLoc dl(N);
// Widen to at least 4 input elements.
if (NumElts < 4)
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getConstantFP(0.0, dl, SrcVT));
// Destination is vXi16 with at least 8 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
std::max(8U, NumElts));
SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
DAG.getTargetConstant(4, dl, MVT::i32));
// Extract down to real number of elements.
if (NumElts < 8) {
EVT IntVT = VT.changeVectorElementTypeToInteger();
Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
DAG.getIntPtrConstant(0, dl));
}
return DAG.getBitcast(VT, Cvt);
}
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
SDValue Src = N->getOperand(0);
// Turn MOVDQ2Q+simple_load into an mmx load.
if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
if (LN->isSimple()) {
SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
LN->getBasePtr(),
LN->getPointerInfo(),
LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
return NewLd;
}
}
return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
case ISD::SCALAR_TO_VECTOR:
return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
return combineExtractSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case X86ISD::VEXTRACT_STORE:
return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
return combineSIntToFP(N, DAG, DCI, Subtarget);
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::STRICT_CVTTP2SI:
case X86ISD::CVTTP2SI:
case X86ISD::STRICT_CVTTP2UI:
case X86ISD::CVTTP2UI:
return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::STRICT_CVTPH2PS:
case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
return combineVectorShiftVar(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT:
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
case X86ISD::STRICT_FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
case X86ISD::STRICT_FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::STRICT_FNMSUB:
case X86ISD::FNMSUB_RND:
case ISD::FMA:
case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MGATHER:
case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
}
return SDValue();
}
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
// There are no vXi8 shifts.
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
// TODO: Almost no 8-bit ops are desirable because they have no actual
// size/speed advantages vs. 32-bit ops, but they do have a major
// potential disadvantage by causing partial register stalls.
//
// 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
// we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
// so those are not desirable.
if (VT == MVT::i16) {
switch (Opc) {
default:
break;
case ISD::LOAD:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return false;
}
}
// Any legal type not explicitly accounted for above is desirable.
return true;
}
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
SDValue Value, SDValue Addr,
SelectionDAG &DAG) const {
const Module *M = DAG.getMachineFunction().getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (IsCFProtectionSupported) {
// When control-flow branch protection is enabled, we need to add a notrack
// prefix to the indirect branch. To do that we create an NT_BRIND SDNode.
// During ISel, the pattern will convert it to a jmp with the NoTrack prefix.
return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}
return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
}
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
isa<ConstantSDNode>(Op.getOperand(1));
// i16 is legal, but undesirable since i16 instruction encodings are longer
// and some i16 instructions are slow.
// 8-bit multiply-by-constant can usually be expanded to something cheaper
// using LEA and/or other ALU ops.
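// For example, (mul i8 X, 5) is promoted to i32 so that it can be selected
// as a single 32-bit LEA rather than an 8-bit multiply.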
if (VT != MVT::i16 && !Is8BitMulByConstant)
return false;
auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (!ISD::isNormalStore(User))
return false;
auto *Ld = cast<LoadSDNode>(Load);
auto *St = cast<StoreSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
return false;
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (User->getOpcode() != ISD::ATOMIC_STORE)
return false;
auto *Ld = cast<AtomicSDNode>(Load);
auto *St = cast<AtomicSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N1) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
if (MayFoldLoad(N0) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
if (IsFoldableAtomicRMW(N0, Op) ||
(Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
}
}
PVT = MVT::i32;
return true;
}
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Helper to match a string against a sequence of pieces separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
for (StringRef Piece : Pieces) {
if (!S.startswith(Piece)) // Check if the piece matches.
return false;
S = S.substr(Piece.size());
StringRef::size_type Pos = S.find_first_not_of(" \t");
if (Pos == 0) // We matched a prefix.
return false;
S = S.substr(Pos);
}
return S.empty();
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
}
}
return false;
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
switch (AsmPieces.size()) {
default: return false;
case 1:
// FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
}
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
if (Constraints.size() >= 2 &&
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
}
return false;
}
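// Hedged illustration (not part of the original source) of what ExpandInlineAsm
// recognizes: an IR call such as
//   %r = call i32 asm "bswap $0", "=r,0"(i32 %v)
// hits the single-statement case above and is rewritten by LowerToByteSwap into
//   %r = call i32 @llvm.bswap.i32(i32 %v)
// so later passes can optimize it like any other intrinsic.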
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
.Case("{@cca}", X86::COND_A)
.Case("{@ccae}", X86::COND_AE)
.Case("{@ccb}", X86::COND_B)
.Case("{@ccbe}", X86::COND_BE)
.Case("{@ccc}", X86::COND_B)
.Case("{@cce}", X86::COND_E)
.Case("{@ccz}", X86::COND_E)
.Case("{@ccg}", X86::COND_G)
.Case("{@ccge}", X86::COND_GE)
.Case("{@ccl}", X86::COND_L)
.Case("{@ccle}", X86::COND_LE)
.Case("{@ccna}", X86::COND_BE)
.Case("{@ccnae}", X86::COND_B)
.Case("{@ccnb}", X86::COND_AE)
.Case("{@ccnbe}", X86::COND_A)
.Case("{@ccnc}", X86::COND_AE)
.Case("{@ccne}", X86::COND_NE)
.Case("{@ccnz}", X86::COND_NE)
.Case("{@ccng}", X86::COND_LE)
.Case("{@ccnge}", X86::COND_L)
.Case("{@ccnl}", X86::COND_GE)
.Case("{@ccnle}", X86::COND_G)
.Case("{@ccno}", X86::COND_NO)
.Case("{@ccnp}", X86::COND_P)
.Case("{@ccns}", X86::COND_NS)
.Case("{@cco}", X86::COND_O)
.Case("{@ccp}", X86::COND_P)
.Case("{@ccs}", X86::COND_S)
.Default(X86::COND_INVALID);
return Cond;
}
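// Usage sketch (an assumption for illustration, not from the original source):
// a flag-output constraint written as "=@ccz" in source reaches this function
// as the string "{@ccz}" and maps to X86::COND_E; LowerAsmOutputForConstraint
// below then materializes that condition from EFLAGS with a SETcc.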
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
case 'N':
case 'G':
case 'L':
case 'M':
return C_Immediate;
case 'C':
case 'e':
case 'Z':
return C_Other;
default:
break;
}
}
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
case 'Y':
switch (Constraint[1]) {
default:
break;
case 'z':
return C_Register;
case 'i':
case 'm':
case 'k':
case 't':
case '2':
return C_RegisterClass;
}
}
} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
break;
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
break;
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'Y':
if (StringRef(constraint).size() != 2)
break;
switch (constraint[1]) {
default:
return CW_Invalid;
// XMM0
case 'z':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
case 'k':
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
return CW_Register;
return CW_Invalid;
// Any MMX reg
case 'm':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
case 't':
case '2':
if (!Subtarget.hasSSE2())
return CW_Invalid;
break;
}
break;
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
weight = CW_Register;
break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
break;
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
break;
}
return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
// Lower @cc targets via setcc.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
// Check that return type is valid.
if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
OpInfo.ConstraintVT.getSizeInBits() < 8)
report_fatal_error("Flag output operand is of invalid type");
// Get EFLAGS register. Only update chain when copyfrom is glued.
if (Flag.getNode()) {
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
Chain = Flag.getValue(1);
} else
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
// Extract CC code.
SDValue CC = getSETCC(Cond, Flag, DL, DAG);
// Zero-extend the i8 SETcc result to the constraint type.
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
return Result;
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
}
return;
}
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
return;
}
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(
Subtarget.classifyGlobalReference(GA->getGlobal())))
return;
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
RC.hasSuperClassEq(&X86::VR512RegClass);
}
/// Check if \p RC is a mask register class.
/// I.e., VK* or one of their variants.
static bool isVKClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::VK1RegClass) ||
RC.hasSuperClassEq(&X86::VK2RegClass) ||
RC.hasSuperClassEq(&X86::VK4RegClass) ||
RC.hasSuperClassEq(&X86::VK8RegClass) ||
RC.hasSuperClassEq(&X86::VK16RegClass) ||
RC.hasSuperClassEq(&X86::VK32RegClass) ||
RC.hasSuperClassEq(&X86::VK64RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16RegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32RegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
break;
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
break;
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
return std::make_pair(0U, &X86::RFP80RegClass);
break;
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
case MVT::i128:
if (Subtarget.is64Bit()) {
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
}
break;
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (!Subtarget.hasAVX512()) break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
default:
break;
case 'i':
case 't':
case '2':
return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
if (!Subtarget.hasSSE1()) break;
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
return std::make_pair(X86::XMM0, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
return std::make_pair(X86::XMM0, &X86::FR64RegClass);
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (Subtarget.hasAVX())
return std::make_pair(X86::YMM0, &X86::VR256RegClass);
break;
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (Subtarget.hasAVX512())
return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
break;
}
break;
case 'k':
// These register classes don't allocate k0, which cannot be used as a write mask.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8WMRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16WMRegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32WMRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
}
break;
}
}
if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return std::make_pair(0U, &X86::GR32RegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' && Constraint[6] == '}') {
// st(7) is not allocatable and thus not a member of RFP80. Return a
// singleton class in cases where we have a reference to it.
if (Constraint[4] == '7')
return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
return std::make_pair(X86::FP0 + Constraint[4] - '0',
&X86::RFP80RegClass);
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
if (StringRef("{dirflag}").equals_lower(Constraint))
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
if (StringRef("{fpsr}").equals_lower(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
return Res;
}
// Make sure it isn't a register that requires 64-bit mode.
if (!Subtarget.is64Bit() &&
(isFRClass(*Res.second) || isGRClass(*Res.second)) &&
TRI->getEncodingValue(Res.first) >= 8) {
// Register requires REX prefix, but we're in 32-bit mode.
return std::make_pair(0, nullptr);
}
// Make sure it isn't a register that requires AVX512.
if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
TRI->getEncodingValue(Res.first) & 0x10) {
// Register requires EVEX prefix.
return std::make_pair(0, nullptr);
}
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. E.g., "ax" with MVT::i32 should
// return "eax". This should even work for things like getting 64-bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
: nullptr;
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
switch (DestReg) {
case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
}
}
if (RC && RC->contains(DestReg))
return std::make_pair(DestReg, RC);
return Res;
}
// No register found/type mismatch.
return std::make_pair(0, nullptr);
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
Res.second = &X86::VR128XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: return an error.
Res.first = 0;
Res.second = nullptr;
}
} else if (isVKClass(*Class)) {
if (VT == MVT::i1)
Res.second = &X86::VK1RegClass;
else if (VT == MVT::i8)
Res.second = &X86::VK8RegClass;
else if (VT == MVT::i16)
Res.second = &X86::VK16RegClass;
else if (VT == MVT::i32)
Res.second = &X86::VK32RegClass;
else if (VT == MVT::i64)
Res.second = &X86::VK64RegClass;
else {
// Type mismatch and not a clobber: return an error.
Res.first = 0;
Res.second = nullptr;
}
}
return Res;
}
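// Fix-up sketch (hedged, not from the original source): for a physical-register
// constraint such as "{xmm0}" with VT == MVT::v4f32, the generic lookup may
// return XMM0 in a scalar FP class; the isFRClass branch above then remaps the
// class to VR128X so the requested vector type is legal for the register class.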
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// takes two allocations in the out-of-order engine instead of one
// for the plain addressing mode, i.e., inst (reg1).
// E.g.,
// vaddps (%rsi,%rdx), %ymm0, %ymm1
// requires two allocations (one for the load, one for the computation),
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
// requires just one allocation, freeing allocations for other operations
// and leaving fewer micro-operations to execute.
//
// For some X86 architectures, this is even worse because for instance for
// stores, the complex addressing mode forces the instruction to use the
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
return -1;
}
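// Worked example (hedged, not from the original source): a legal mode such as
// (%rsi,%rdx,2) has AM.Scale == 2 and costs 1, plain (%rsi) has AM.Scale == 0
// and costs 0, and an addressing mode that is not legal returns -1.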
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
// The exception to this is vector division. Since x86 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (X86::GR64RegClass.contains(*I))
RC = &X86::GR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(
Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
/// Returns true if stack probing through a function call is requested.
bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
return !getStackProbeSymbolName(MF).empty();
}
/// Returns true if stack probing through inline assembly is requested.
bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
// No inline stack probes for Windows; it has its own mechanism.
if (Subtarget.isOSWindows() ||
MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return false;
// If the function specifically requests inline stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
"inline-asm";
return false;
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// Inline stack probes disable the stack probe call.
if (hasInlineStackProbe(MF))
return "";
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
// symbol.
if (Subtarget.is64Bit())
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
unsigned
X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
// The default stack probe size is 4096 if the function has no
// "stack-probe-size" attribute.
unsigned StackProbeSize = 4096;
const Function &Fn = MF.getFunction();
if (Fn.hasFnAttribute("stack-probe-size"))
Fn.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
return StackProbeSize;
}
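// Illustrative IR function attributes that drive the probe hooks above (a
// hedged sketch, not from the original source; the probe symbol name is
// hypothetical):
//   attributes #0 = { "probe-stack"="inline-asm" }   ; inline probing
//   attributes #1 = { "probe-stack"="__my_probe" }   ; probe via a named symbol
//   attributes #2 = { "stack-probe-size"="8192" }    ; override the 4096 default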
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c40901255424..cd39428b9c38 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -1,385 +1,387 @@
//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines an interface to a lib.exe-compatible driver that also understands
// bitcode files. Used by llvm-lib and lld-link /lib.
//
//===----------------------------------------------------------------------===//
#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
enum {
OPT_INVALID = 0,
#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
#include "Options.inc"
#undef OPTION
};
#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
#include "Options.inc"
#undef PREFIX
static const opt::OptTable::Info InfoTable[] = {
#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \
{X1, X2, X10, X11, OPT_##ID, opt::Option::KIND##Class, \
X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12},
#include "Options.inc"
#undef OPTION
};
class LibOptTable : public opt::OptTable {
public:
LibOptTable() : OptTable(InfoTable, true) {}
};
}
static std::string getDefaultOutputPath(const NewArchiveMember &FirstMember) {
SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier());
sys::path::replace_extension(Val, ".lib");
return std::string(Val.str());
}
static std::vector<StringRef> getSearchPaths(opt::InputArgList *Args,
StringSaver &Saver) {
std::vector<StringRef> Ret;
// Add current directory as first item of the search path.
Ret.push_back("");
// Add /libpath flags.
for (auto *Arg : Args->filtered(OPT_libpath))
Ret.push_back(Arg->getValue());
// Add $LIB.
Optional<std::string> EnvOpt = sys::Process::GetEnv("LIB");
if (!EnvOpt.hasValue())
return Ret;
StringRef Env = Saver.save(*EnvOpt);
while (!Env.empty()) {
StringRef Path;
std::tie(Path, Env) = Env.split(';');
Ret.push_back(Path);
}
return Ret;
}
static std::string findInputFile(StringRef File, ArrayRef<StringRef> Paths) {
for (StringRef Dir : Paths) {
SmallString<128> Path = Dir;
sys::path::append(Path, File);
if (sys::fs::exists(Path))
return std::string(Path);
}
return "";
}
static void fatalOpenError(llvm::Error E, Twine File) {
if (!E)
return;
handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
llvm::errs() << "error opening '" << File << "': " << EIB.message() << '\n';
exit(1);
});
}
static void doList(opt::InputArgList& Args) {
// lib.exe prints the contents of the first archive file.
std::unique_ptr<MemoryBuffer> B;
for (auto *Arg : Args.filtered(OPT_INPUT)) {
// Create or open the archive object.
ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf =
MemoryBuffer::getFile(Arg->getValue(), -1, false);
fatalOpenError(errorCodeToError(MaybeBuf.getError()), Arg->getValue());
if (identify_magic(MaybeBuf.get()->getBuffer()) == file_magic::archive) {
B = std::move(MaybeBuf.get());
break;
}
}
// lib.exe doesn't print an error if no .lib files are passed.
if (!B)
return;
Error Err = Error::success();
object::Archive Archive(B.get()->getMemBufferRef(), Err);
fatalOpenError(std::move(Err), B->getBufferIdentifier());
for (auto &C : Archive.children(Err)) {
Expected<StringRef> NameOrErr = C.getName();
fatalOpenError(NameOrErr.takeError(), B->getBufferIdentifier());
StringRef Name = NameOrErr.get();
llvm::outs() << Name << '\n';
}
fatalOpenError(std::move(Err), B->getBufferIdentifier());
}
static COFF::MachineTypes getCOFFFileMachine(MemoryBufferRef MB) {
std::error_code EC;
auto Obj = object::COFFObjectFile::create(MB);
if (!Obj) {
llvm::errs() << MB.getBufferIdentifier()
<< ": failed to open: " << Obj.takeError() << '\n';
exit(1);
}
uint16_t Machine = (*Obj)->getMachine();
if (Machine != COFF::IMAGE_FILE_MACHINE_I386 &&
Machine != COFF::IMAGE_FILE_MACHINE_AMD64 &&
Machine != COFF::IMAGE_FILE_MACHINE_ARMNT &&
Machine != COFF::IMAGE_FILE_MACHINE_ARM64) {
llvm::errs() << MB.getBufferIdentifier() << ": unknown machine: " << Machine
<< '\n';
exit(1);
}
return static_cast<COFF::MachineTypes>(Machine);
}
static COFF::MachineTypes getBitcodeFileMachine(MemoryBufferRef MB) {
Expected<std::string> TripleStr = getBitcodeTargetTriple(MB);
if (!TripleStr) {
llvm::errs() << MB.getBufferIdentifier()
<< ": failed to get target triple from bitcode\n";
exit(1);
}
switch (Triple(*TripleStr).getArch()) {
case Triple::x86:
return COFF::IMAGE_FILE_MACHINE_I386;
case Triple::x86_64:
return COFF::IMAGE_FILE_MACHINE_AMD64;
case Triple::arm:
return COFF::IMAGE_FILE_MACHINE_ARMNT;
case Triple::aarch64:
return COFF::IMAGE_FILE_MACHINE_ARM64;
default:
llvm::errs() << MB.getBufferIdentifier()
<< ": unknown arch in target triple " << *TripleStr << '\n';
exit(1);
}
}
static void appendFile(std::vector<NewArchiveMember> &Members,
COFF::MachineTypes &LibMachine,
std::string &LibMachineSource, MemoryBufferRef MB) {
file_magic Magic = identify_magic(MB.getBuffer());
if (Magic != file_magic::coff_object && Magic != file_magic::bitcode &&
- Magic != file_magic::archive && Magic != file_magic::windows_resource) {
+ Magic != file_magic::archive && Magic != file_magic::windows_resource &&
+ Magic != file_magic::coff_import_library) {
llvm::errs() << MB.getBufferIdentifier()
- << ": not a COFF object, bitcode, archive or resource file\n";
+ << ": not a COFF object, bitcode, archive, import library or "
+ "resource file\n";
exit(1);
}
// If a user attempts to add an archive to another archive, llvm-lib doesn't
// handle the first archive file as a single file. Instead, it extracts all
// members from the archive and adds them to the second archive. This behavior
// is for compatibility with Microsoft's lib command.
if (Magic == file_magic::archive) {
Error Err = Error::success();
object::Archive Archive(MB, Err);
fatalOpenError(std::move(Err), MB.getBufferIdentifier());
for (auto &C : Archive.children(Err)) {
Expected<MemoryBufferRef> ChildMB = C.getMemoryBufferRef();
if (!ChildMB) {
handleAllErrors(ChildMB.takeError(), [&](const ErrorInfoBase &EIB) {
llvm::errs() << MB.getBufferIdentifier() << ": " << EIB.message()
<< "\n";
});
exit(1);
}
appendFile(Members, LibMachine, LibMachineSource, *ChildMB);
}
fatalOpenError(std::move(Err), MB.getBufferIdentifier());
return;
}
// Check that all input files have the same machine type.
// Mixing normal objects and LTO bitcode files is fine as long as they
// have the same machine type.
// Doing this here duplicates the header parsing work that writeArchive()
// below does, but it's not a lot of work and it's a bit awkward to do
// in writeArchive() which needs to support many tools, can't assume the
// input is COFF, and doesn't have a good way to report errors.
if (Magic == file_magic::coff_object || Magic == file_magic::bitcode) {
COFF::MachineTypes FileMachine = (Magic == file_magic::coff_object)
? getCOFFFileMachine(MB)
: getBitcodeFileMachine(MB);
// FIXME: Once lld-link rejects multiple resource .obj files:
// Call convertResToCOFF() on .res files and add the resulting
// COFF file to the .lib output instead of adding the .res file, and remove
// this check. See PR42180.
if (FileMachine != COFF::IMAGE_FILE_MACHINE_UNKNOWN) {
if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) {
LibMachine = FileMachine;
LibMachineSource =
(" (inferred from earlier file '" + MB.getBufferIdentifier() + "')")
.str();
} else if (LibMachine != FileMachine) {
llvm::errs() << MB.getBufferIdentifier() << ": file machine type "
<< machineToStr(FileMachine)
<< " conflicts with library machine type "
<< machineToStr(LibMachine) << LibMachineSource << '\n';
exit(1);
}
}
}
Members.emplace_back(MB);
}
int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
BumpPtrAllocator Alloc;
StringSaver Saver(Alloc);
// Parse command line arguments.
SmallVector<const char *, 20> NewArgs(ArgsArr.begin(), ArgsArr.end());
cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs);
ArgsArr = NewArgs;
LibOptTable Table;
unsigned MissingIndex;
unsigned MissingCount;
opt::InputArgList Args =
Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount);
if (MissingCount) {
llvm::errs() << "missing arg value for \""
<< Args.getArgString(MissingIndex) << "\", expected "
<< MissingCount
<< (MissingCount == 1 ? " argument.\n" : " arguments.\n");
return 1;
}
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getAsString(Args)
<< "\n";
// Handle /help
if (Args.hasArg(OPT_help)) {
Table.PrintHelp(outs(), "llvm-lib [options] file...", "LLVM Lib");
return 0;
}
// If there are no input files and we aren't told otherwise, silently do
// nothing to match lib.exe.
if (!Args.hasArgNoClaim(OPT_INPUT) && !Args.hasArg(OPT_llvmlibempty))
return 0;
if (Args.hasArg(OPT_lst)) {
doList(Args);
return 0;
}
std::vector<StringRef> SearchPaths = getSearchPaths(&Args, Saver);
COFF::MachineTypes LibMachine = COFF::IMAGE_FILE_MACHINE_UNKNOWN;
std::string LibMachineSource;
if (auto *Arg = Args.getLastArg(OPT_machine)) {
LibMachine = getMachineType(Arg->getValue());
if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) {
llvm::errs() << "unknown /machine: arg " << Arg->getValue() << '\n';
return 1;
}
LibMachineSource =
std::string(" (from '/machine:") + Arg->getValue() + "' flag)";
}
std::vector<std::unique_ptr<MemoryBuffer>> MBs;
StringSet<> Seen;
std::vector<NewArchiveMember> Members;
// Create a NewArchiveMember for each input file.
for (auto *Arg : Args.filtered(OPT_INPUT)) {
// Find a file
std::string Path = findInputFile(Arg->getValue(), SearchPaths);
if (Path.empty()) {
llvm::errs() << Arg->getValue() << ": no such file or directory\n";
return 1;
}
// Input files are uniquified by pathname. If you specify the exact same
// path more than once, all but the first one are ignored.
//
// Note that there's a loophole in the rule; you can prepend `.\` or
// something like that to a path to make it look different, and they are
// handled as if they were different files. This behavior is compatible with
// Microsoft lib.exe.
if (!Seen.insert(Path).second)
continue;
// Open a file.
ErrorOr<std::unique_ptr<MemoryBuffer>> MOrErr =
MemoryBuffer::getFile(Path, -1, false);
fatalOpenError(errorCodeToError(MOrErr.getError()), Path);
MemoryBufferRef MBRef = (*MOrErr)->getMemBufferRef();
// Append a file.
appendFile(Members, LibMachine, LibMachineSource, MBRef);
// Take the ownership of the file buffer to keep the file open.
MBs.push_back(std::move(*MOrErr));
}
// Create an archive file.
std::string OutputPath;
if (auto *Arg = Args.getLastArg(OPT_out)) {
OutputPath = Arg->getValue();
} else if (!Members.empty()) {
OutputPath = getDefaultOutputPath(Members[0]);
} else {
llvm::errs() << "no output path given, and cannot infer with no inputs\n";
return 1;
}
// llvm-lib uses relative paths for both regular and thin archives, unlike
// standard GNU ar, which only uses relative paths for thin archives and
// basenames for regular archives.
for (NewArchiveMember &Member : Members) {
if (sys::path::is_relative(Member.MemberName)) {
Expected<std::string> PathOrErr =
computeArchiveRelativePath(OutputPath, Member.MemberName);
if (PathOrErr)
Member.MemberName = Saver.save(*PathOrErr);
}
}
if (Error E =
writeArchive(OutputPath, Members,
/*WriteSymtab=*/true, object::Archive::K_GNU,
/*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin))) {
handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
llvm::errs() << OutputPath << ": " << EI.message() << "\n";
});
return 1;
}
return 0;
}
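// Example invocation (a hedged sketch using only flags handled above):
//   llvm-lib /libpath:some\dir /out:combined.lib a.obj b.obj
// When /out: is omitted, the output path defaults to the first member's name
// with its extension replaced by ".lib" (see getDefaultOutputPath).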
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7cfe4c8b5892..c7f2f4ec3ca1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1,1876 +1,1875 @@
//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains logic for simplifying instructions based on information
// about how they are used.
//
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
namespace {
struct AMDGPUImageDMaskIntrinsic {
unsigned Intr;
};
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"
} // end anonymous namespace
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
const APInt &Demanded) {
assert(I && "No instruction?");
assert(OpNo < I->getNumOperands() && "Operand index too large");
// The operand must be a constant integer or splat integer.
Value *Op = I->getOperand(OpNo);
const APInt *C;
if (!match(Op, m_APInt(C)))
return false;
// If there are no bits set that aren't demanded, nothing to do.
if (C->isSubsetOf(Demanded))
return false;
// This instruction is producing bits that are not demanded. Shrink the RHS.
I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
return true;
}
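// Worked example (hedged, not from the original source): for
//   %a = and i8 %x, -4          ; RHS constant 0b11111100
// with Demanded == 0b00001111, the constant sets bits outside the demanded
// mask, so it is shrunk to 0b00001100 and the function returns true.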
/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
/// the instruction has any properties that allow us to simplify its operands.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
KnownBits Known(BitWidth);
APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
0, &Inst);
if (!V) return false;
if (V == &Inst) return true;
replaceInstUsesWith(Inst, V);
return true;
}
/// This form of SimplifyDemandedBits simplifies the specified instruction
/// operand if possible, updating it in place. It returns true if it made any
/// change and false otherwise.
bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
const APInt &DemandedMask,
KnownBits &Known,
unsigned Depth) {
Use &U = I->getOperandUse(OpNo);
Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
Depth, I);
if (!NewVal) return false;
if (Instruction* OpInst = dyn_cast<Instruction>(U))
salvageDebugInfo(*OpInst);
replaceUse(U, NewVal);
return true;
}
/// This function attempts to replace V with a simpler value based on the
/// demanded bits. When this function is called, it is known that only the bits
/// set in DemandedMask of the result of V are ever used downstream.
/// Consequently, depending on the mask and V, it may be possible to replace V
/// with a constant or one of its operands. In such cases, this function does
/// the replacement and returns the new value. Otherwise, it analyzes the
/// expression, setting Known.One to all the bits that are known to be one and
/// Known.Zero to all the bits that are known to be zero in
/// the expression. These are provided to potentially allow the caller (which
/// might recursively be SimplifyDemandedBits itself) to simplify the
/// expression.
/// Known.One and Known.Zero always follow the invariant that:
/// Known.One & Known.Zero == 0.
/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
/// be the same.
///
/// This returns null if it did not change anything and it permits no
/// simplification. This returns V itself if it did some simplification of V's
/// operands based on the information about what bits are demanded. This returns
/// some other non-null value if it found out that V is equal to another value
/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits &Known, unsigned Depth,
Instruction *CxtI) {
assert(V != nullptr && "Null pointer of Value???");
assert(Depth <= 6 && "Limit Search Depth");
uint32_t BitWidth = DemandedMask.getBitWidth();
Type *VTy = V->getType();
assert(
(!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
Known.getBitWidth() == BitWidth &&
"Value *V, DemandedMask and Known must have same BitWidth");
if (isa<Constant>(V)) {
computeKnownBits(V, Known, Depth, CxtI);
return nullptr;
}
Known.resetAll();
if (DemandedMask.isNullValue()) // Not demanding any bits from V.
return UndefValue::get(VTy);
if (Depth == 6) // Limit search depth.
return nullptr;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
computeKnownBits(V, Known, Depth, CxtI);
return nullptr; // Only analyze instructions.
}
// If there are multiple uses of this value and we aren't at the root, then
// we can't do any simplifications of the operands, because DemandedMask
// only reflects the bits demanded by *one* of the users.
if (Depth != 0 && !I->hasOneUse())
return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedMask to all bits so that we can try to simplify the
// operands. This allows visitTruncInst (for example) to simplify the
// operand of a trunc without duplicating all the logic below.
if (Depth == 0 && !V->hasOneUse())
DemandedMask.setAllBits();
switch (I->getOpcode()) {
default:
computeKnownBits(I, Known, Depth, CxtI);
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
return I;
break;
}
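// Example for the 'and' case above (hedged, not from the original source):
// if only bits 2 and 3 of (%x & 15) are demanded, every demanded bit is known
// one on the RHS, so the whole 'and' simplifies to %x via the
// "return I->getOperand(0)" path.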
case Instruction::Or: {
// If either the LHS or the RHS are One, the result is One.
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
break;
}
case Instruction::Xor: {
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if (DemandedMask.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
// If all of the demanded bits are known to be zero on one side or the
// other, turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
I->getName());
return InsertNewInstWith(Or, *I);
}
// If all of the demanded bits on one side are known, and all of the set
// bits on that side are also known to be set on the other side, turn this
// into an AND, as we know the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
RHSKnown.One.isSubsetOf(LHSKnown.One)) {
Constant *AndC = Constant::getIntegerValue(VTy,
~RHSKnown.One & DemandedMask);
Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
return InsertNewInstWith(And, *I);
}
// If the RHS is a constant, see if we can simplify it.
// FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
// If our LHS is an 'and' and if it has one use, and if any of the bits we
// are flipping are known to be set, then the xor is just resetting those
// bits to zero. We can just knock out bits from the 'and' and the 'xor',
// simplifying both of them.
if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0)))
if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
isa<ConstantInt>(I->getOperand(1)) &&
isa<ConstantInt>(LHSInst->getOperand(1)) &&
(LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
InsertNewInstWith(NewAnd, *I);
Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
return InsertNewInstWith(NewXor, *I);
}
break;
}
case Instruction::Select: {
Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
if (SPF == SPF_UMAX) {
// UMax(A, C) == A if ...
// The lowest non-zero bit of DemandedMask is higher than the highest
// non-zero bit of C.
const APInt *C;
unsigned CTZ = DemandedMask.countTrailingZeros();
if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
return LHS;
} else if (SPF == SPF_UMIN) {
// UMin(A, C) == A if ...
// The lowest non-zero bit of DemandedMask is higher than the highest
// non-one bit of C.
// This comes from using DeMorgans on the above umax example.
const APInt *C;
unsigned CTZ = DemandedMask.countTrailingZeros();
if (match(RHS, m_APInt(C)) &&
CTZ >= C->getBitWidth() - C->countLeadingOnes())
return LHS;
}
// If this is a select as part of any other min/max pattern, don't simplify
// any further in case we break the structure.
if (SPF != SPF_UNKNOWN)
return nullptr;
if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
// This is similar to ShrinkDemandedConstant, but for a select we want to
// try to keep the selected constants the same as icmp value constants, if
// we can. This helps not break apart (or helps put back together)
// canonical patterns like min and max.
auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
APInt DemandedMask) {
const APInt *SelC;
if (!match(I->getOperand(OpNo), m_APInt(SelC)))
return false;
// Get the constant out of the ICmp, if there is one.
const APInt *CmpC;
ICmpInst::Predicate Pred;
if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) ||
CmpC->getBitWidth() != SelC->getBitWidth())
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
// If the constant is already the same as the ICmp, leave it as-is.
if (*CmpC == *SelC)
return false;
// If the constants are not already the same, but can be with the demand
// mask, use the constant value from the ICmp.
if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
return true;
}
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
};
if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
CanonicalizeSelectConstant(I, 2, DemandedMask))
return I;
// Only known if known in both the LHS and RHS.
Known.One = RHSKnown.One & LHSKnown.One;
Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
break;
}
case Instruction::ZExt:
case Instruction::Trunc: {
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
Known = InputKnown.zextOrTrunc(BitWidth);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
case Instruction::BitCast:
if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
return nullptr; // vector->int or fp->int?
if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
if (VectorType *SrcVTy =
dyn_cast<VectorType>(I->getOperand(0)->getType())) {
if (DstVTy->getNumElements() != SrcVTy->getNumElements())
// Don't touch a bitcast between vectors of different element counts.
return nullptr;
} else
// Don't touch a scalar-to-vector bitcast.
return nullptr;
} else if (I->getOperand(0)->getType()->isVectorTy())
// Don't touch a vector-to-scalar bitcast.
return nullptr;
if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
// If any of the sign extended bits are demanded, we know that the sign
// bit is demanded.
if (DemandedMask.getActiveBits() > SrcBitWidth)
InputDemandedBits.setBit(SrcBitWidth-1);
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
return I;
// If the input sign bit is known zero, or if none of the extended bits are
// demanded, convert this into a zero extension.
if (InputKnown.isNonNegative() ||
DemandedMask.getActiveBits() <= SrcBitWidth) {
// Convert to ZExt cast.
CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
return InsertNewInstWith(NewCast, *I);
}
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
Known = InputKnown.sext(BitWidth);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
case Instruction::Add:
if ((DemandedMask & 1) == 0) {
// If we do not need the low bit, try to convert bool math to logic:
// add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
Value *X, *Y;
if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
m_OneUse(m_SExt(m_Value(Y))))) &&
X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
// Truth table for inputs and output signbits:
// X:0 | X:1
// ----------
// Y:0 | 0 | 0 |
// Y:1 | -1 | 0 |
// ----------
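// e.g. (illustrative, i8): X = 1, Y = 0 gives zext(X) + sext(Y) = 1, while
// sext(~X & Y) = 0; the two differ only in the low bit, which is not
// demanded here.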
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(I);
Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
return Builder.CreateSExt(AndNot, VTy);
}
// add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
// TODO: Relax the one-use checks because we are removing an instruction?
if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
m_OneUse(m_SExt(m_Value(Y))))) &&
X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
// Truth table for inputs and output signbits:
// X:0 | X:1
// -----------
// Y:0 | -1 | -1 |
// Y:1 | -1 | 0 |
// -----------
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(I);
Value *Or = Builder.CreateOr(X, Y);
return Builder.CreateSExt(Or, VTy);
}
}
LLVM_FALLTHROUGH;
case Instruction::Sub: {
/// If the high-bits of an ADD/SUB are not demanded, then we do not care
/// about the high bits of the operands.
unsigned NLZ = DemandedMask.countLeadingZeros();
// Right fill the mask of bits for this ADD/SUB to demand the most
// significant bit and all those below it.
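// e.g. (illustrative, i8): DemandedMask = 0x0F gives NLZ = 4 and
// DemandedFromOps = 0x0F; carries only propagate upward, so the operands'
// high bits cannot affect the demanded low bits.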
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
if (NLZ > 0) {
// Disable the nsw and nuw flags here: We can no longer guarantee that
// we won't wrap after simplification. Removing the nsw/nuw flags is
// legal here because the top bit is not demanded.
BinaryOperator &BinOP = *cast<BinaryOperator>(I);
BinOP.setHasNoSignedWrap(false);
BinOP.setHasNoUnsignedWrap(false);
}
return I;
}
// If we are known to be adding/subtracting zeros to every bit below
// the highest demanded bit, we just return the other side.
if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
// We can't do this with the LHS for subtraction, unless we are only
// demanding the LSB.
if ((I->getOpcode() == Instruction::Add ||
DemandedFromOps.isOneValue()) &&
DemandedFromOps.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
// Otherwise just compute the known bits of the result.
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
NSW, LHSKnown, RHSKnown);
break;
}
case Instruction::Shl: {
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
const APInt *ShrAmt;
if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
DemandedMask, Known))
return R;
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
// If the shift is NUW/NSW, then it does demand the high bits.
ShlOperator *IOp = cast<ShlOperator>(I);
if (IOp->hasNoSignedWrap())
DemandedMaskIn.setHighBits(ShiftAmt+1);
else if (IOp->hasNoUnsignedWrap())
DemandedMaskIn.setHighBits(ShiftAmt);
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
bool SignBitZero = Known.Zero.isSignBitSet();
bool SignBitOne = Known.One.isSignBitSet();
Known.Zero <<= ShiftAmt;
Known.One <<= ShiftAmt;
// low bits known zero.
if (ShiftAmt)
Known.Zero.setLowBits(ShiftAmt);
// If this shift has "nsw" keyword, then the result is either a poison
// value or has the same sign bit as the first operand.
if (IOp->hasNoSignedWrap()) {
if (SignBitZero)
Known.Zero.setSignBit();
else if (SignBitOne)
Known.One.setSignBit();
if (Known.hasConflict())
return UndefValue::get(I->getType());
}
} else {
computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
case Instruction::LShr: {
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Unsigned shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<LShrOperator>(I)->isExact())
DemandedMaskIn.setLowBits(ShiftAmt);
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShiftAmt);
Known.One.lshrInPlace(ShiftAmt);
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
} else {
computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
case Instruction::AShr: {
// If this is an arithmetic shift right and only the low-bit is set, we can
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
if (DemandedMask.isOneValue()) {
// Perform the logical shift right.
Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
return InsertNewInstWith(NewVal, *I);
}
// If the sign bit is the only bit demanded by this ashr, then there is no
// need to do it, the shift doesn't change the high bit.
if (DemandedMask.isSignMask())
return I->getOperand(0);
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Signed shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
// If any of the high bits are demanded, we should set the sign bit as
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
DemandedMaskIn.setSignBit();
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<AShrOperator>(I)->isExact())
DemandedMaskIn.setLowBits(ShiftAmt);
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now plus sign bits.
APInt HighBits(APInt::getHighBitsSet(
BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
Known.Zero.lshrInPlace(ShiftAmt);
Known.One.lshrInPlace(ShiftAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
if (Known.Zero[BitWidth-ShiftAmt-1] ||
!DemandedMask.intersects(HighBits)) {
BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
I->getOperand(1));
LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
return InsertNewInstWith(LShr, *I);
} else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
Known.One |= HighBits;
}
} else {
computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
case Instruction::UDiv: {
// UDiv doesn't demand low bits that are zero in the divisor.
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
// If the shift is exact, then it does demand the low bits.
if (cast<UDivOperator>(I)->isExact())
break;
// FIXME: Take the demanded mask of the result into account.
unsigned RHSTrailingZeros = SA->countTrailingZeros();
APInt DemandedMaskIn =
APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
return I;
// Propagate zero bits from the input.
Known.Zero.setHighBits(std::min(
BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
} else {
computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
// X % -1 demands all the bits because we don't want to introduce
// INT_MIN % -1 (== undef) by accident.
if (Rem->isMinusOne())
break;
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
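// e.g. (illustrative): for "X srem 8", bits 0..2 of the result always equal
// bits 0..2 of X, so a DemandedMask below 8 lets the srem be replaced by X.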
if (DemandedMask.ult(RA)) // srem won't affect demanded bits
return I->getOperand(0);
APInt LowBits = RA - 1;
APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
return I;
// The low bits of LHS are unchanged by the srem.
Known.Zero = LHSKnown.Zero & LowBits;
Known.One = LHSKnown.One & LowBits;
// If LHS is non-negative or has all low bits zero, then the upper bits
// are all zero.
if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
Known.Zero |= ~LowBits;
// If LHS is negative and not all low bits are zero, then the upper bits
// are all one.
if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
Known.One |= ~LowBits;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
}
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero.
if (DemandedMask.isSignBitSet()) {
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
// If it's known zero, our sign bit is also zero.
if (LHSKnown.isNonNegative())
Known.makeNonNegative();
}
break;
case Instruction::URem: {
KnownBits Known2(BitWidth);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
case Instruction::Call: {
bool KnownBitsComputed = false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::bswap: {
// If the only bits demanded come from one byte of the bswap result,
// just shift the input byte into position to eliminate the bswap.
unsigned NLZ = DemandedMask.countLeadingZeros();
unsigned NTZ = DemandedMask.countTrailingZeros();
// Round NTZ down to the next byte. If we have 11 trailing zeros, then
// we need all the bits down to bit 8. Likewise, round NLZ. If we
// have 14 leading zeros, round to 8.
NLZ &= ~7;
NTZ &= ~7;
// If we need exactly one byte, we can do this transformation.
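// e.g. (illustrative, i32): if only bits 8..15 of bswap(x) are demanded,
// they come from bits 16..23 of x, so the bswap becomes "lshr x, 8".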
if (BitWidth-NLZ-NTZ == 8) {
unsigned ResultBit = NTZ;
unsigned InputBit = BitWidth-NTZ-8;
// Replace this with either a left or right shift to get the byte into
// the right place.
Instruction *NewVal;
if (InputBit > ResultBit)
NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
ConstantInt::get(I->getType(), InputBit-ResultBit));
else
NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
ConstantInt::get(I->getType(), ResultBit-InputBit));
NewVal->takeName(I);
return InsertNewInstWith(NewVal, *I);
}
break;
}
case Intrinsic::fshr:
case Intrinsic::fshl: {
const APInt *SA;
if (!match(I->getOperand(2), m_APInt(SA)))
break;
// Normalize to funnel shift left. APInt shifts of BitWidth are well-
// defined, so no need to special-case zero shifts here.
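// e.g. (illustrative, i32): fshr(x, y, 3) is equivalent to fshl(x, y, 29),
// so ShiftAmt below is always a left funnel-shift amount.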
uint64_t ShiftAmt = SA->urem(BitWidth);
if (II->getIntrinsicID() == Intrinsic::fshr)
ShiftAmt = BitWidth - ShiftAmt;
APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
return I;
Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
Known.One = LHSKnown.One.shl(ShiftAmt) |
RHSKnown.One.lshr(BitWidth - ShiftAmt);
KnownBitsComputed = true;
break;
}
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_sse2_pmovmskb_128:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_avx2_pmovmskb: {
// MOVMSK copies the vector elements' sign bits to the low bits
// and zeros the high bits.
unsigned ArgWidth;
if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
} else {
auto Arg = II->getArgOperand(0);
auto ArgType = cast<VectorType>(Arg->getType());
ArgWidth = ArgType->getNumElements();
}
// If we don't need any of the low bits then return zero;
// we know that DemandedMask is non-zero already.
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
if (DemandedElts.isNullValue())
return ConstantInt::getNullValue(VTy);
// We know that the upper bits are set to zero.
Known.Zero.setBitsFrom(ArgWidth);
KnownBitsComputed = true;
break;
}
case Intrinsic::x86_sse42_crc32_64_64:
Known.Zero.setBitsFrom(32);
KnownBitsComputed = true;
break;
}
}
if (!KnownBitsComputed)
computeKnownBits(V, Known, Depth, CxtI);
break;
}
}
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
return Constant::getIntegerValue(VTy, Known.One);
return nullptr;
}
/// Helper routine of SimplifyDemandedUseBits. It computes Known
/// bits. It also tries to handle simplifications that can be done based on
/// DemandedMask, but without modifying the Instruction.
Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
const APInt &DemandedMask,
KnownBits &Known,
unsigned Depth,
Instruction *CxtI) {
unsigned BitWidth = DemandedMask.getBitWidth();
Type *ITy = I->getType();
KnownBits LHSKnown(BitWidth);
KnownBits RHSKnown(BitWidth);
// Despite the fact that we can't simplify this instruction in all of its
// users' contexts, we can at least compute the known bits, and we can
// do simplifications that apply to *just* the one user if we know that
// this instruction has a simpler value in that context.
switch (I->getOpcode()) {
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and' in this
// context.
if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
break;
}
case Instruction::Or: {
// We can simplify (X|Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
// If either the LHS or the RHS is One, the result is One.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other. These bits cannot contribute to the result of the 'or' in this
// context.
if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
break;
}
case Instruction::Xor: {
// We can simplify (X^Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other.
if (DemandedMask.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
break;
}
default:
// Compute the Known bits to simplify things downstream.
computeKnownBits(I, Known, Depth, CxtI);
// If this user is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
return Constant::getIntegerValue(ITy, Known.One);
break;
}
return nullptr;
}
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
/// of "C2-C1".
///
/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
/// ..., bn}, without considering the specific value X is holding.
/// This transformation is legal iff one of the following conditions holds:
/// 1) All the bits in S are 0, in which case E1 == E2.
/// 2) We don't care about those bits in S, per the input DemandedMask.
/// 3) Combination of 1) and 2). Some bits in S are 0, and we don't care
/// about the rest of the bits.
///
/// Currently we only test condition 2).
///
/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
/// not successful.
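/// e.g. (illustrative, i8): for "(X lshr 3) shl 3" with DemandedMask = 0xF8,
/// E1 differs from X only in bits 0..2, which are not demanded, so X itself
/// can be returned (the ShrAmt == ShlAmt case below).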
Value *
InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
Instruction *Shl, const APInt &ShlOp1,
const APInt &DemandedMask,
KnownBits &Known) {
if (!ShlOp1 || !ShrOp1)
return nullptr; // No-op.
Value *VarX = Shr->getOperand(0);
Type *Ty = VarX->getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
return nullptr; // Undef.
unsigned ShlAmt = ShlOp1.getZExtValue();
unsigned ShrAmt = ShrOp1.getZExtValue();
Known.One.clearAllBits();
Known.Zero.setLowBits(ShlAmt - 1);
Known.Zero &= DemandedMask;
APInt BitMask1(APInt::getAllOnesValue(BitWidth));
APInt BitMask2(APInt::getAllOnesValue(BitWidth));
bool isLshr = (Shr->getOpcode() == Instruction::LShr);
BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
(BitMask1.ashr(ShrAmt) << ShlAmt);
if (ShrAmt <= ShlAmt) {
BitMask2 <<= (ShlAmt - ShrAmt);
} else {
BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
BitMask2.ashr(ShrAmt - ShlAmt);
}
// Check if condition 2) (see the comment on this function) is satisfied.
if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
if (ShrAmt == ShlAmt)
return VarX;
if (!Shr->hasOneUse())
return nullptr;
BinaryOperator *New;
if (ShrAmt < ShlAmt) {
Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
New = BinaryOperator::CreateShl(VarX, Amt);
BinaryOperator *Orig = cast<BinaryOperator>(Shl);
New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
} else {
Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
BinaryOperator::CreateAShr(VarX, Amt);
if (cast<BinaryOperator>(Shr)->isExact())
New->setIsExact(true);
}
return InsertNewInstWith(New, *Shl);
}
return nullptr;
}
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx) {
- // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
- if (DMaskIdx < 0 &&
- II->getType()->getScalarSizeInBits() != 32 &&
- DemandedElts.getActiveBits() == 3)
- return nullptr;
-
auto *IIVTy = cast<VectorType>(II->getType());
unsigned VWidth = IIVTy->getNumElements();
if (VWidth == 1)
return nullptr;
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);
// Assume the arguments are unchanged and later override them, if needed.
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
if (DMaskIdx < 0) {
// Buffer case.
const unsigned ActiveBits = DemandedElts.getActiveBits();
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
// Start assuming the prefix of elements is demanded, but possibly clear
// some other bits if there are trailing zeros (unused components at front)
// and update offset.
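// e.g. (illustrative): if only elements 2 and 3 of a <4 x float>
// raw.buffer.load are used, the call is shrunk to <2 x float> below and the
// byte offset operand is bumped by 8.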
DemandedElts = (1 << ActiveBits) - 1;
if (UnusedComponentsAtFront > 0) {
static const unsigned InvalidOffsetIdx = 0xf;
unsigned OffsetIdx;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_s_buffer_load:
// If the resulting type is vec3, there is no point in trimming the
// load with an updated offset, as the vec3 would most likely be widened to
// vec4 anyway during lowering.
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
OffsetIdx = InvalidOffsetIdx;
else
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_struct_buffer_load:
OffsetIdx = 2;
break;
default:
// TODO: handle tbuffer* intrinsics.
OffsetIdx = InvalidOffsetIdx;
break;
}
if (OffsetIdx != InvalidOffsetIdx) {
// Clear demanded bits and update the offset.
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
auto *Offset = II->getArgOperand(OffsetIdx);
unsigned SingleComponentSizeInBits =
getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
unsigned OffsetAdd =
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
}
}
} else {
// Image case.
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
// Mask off values that are undefined because the dmask doesn't cover them
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
unsigned NewDMaskVal = 0;
unsigned OrigLoadIdx = 0;
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
const unsigned Bit = 1 << SrcIdx;
if (!!(DMaskVal & Bit)) {
if (!!DemandedElts[OrigLoadIdx])
NewDMaskVal |= Bit;
OrigLoadIdx++;
}
}
if (DMaskVal != NewDMaskVal)
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}
unsigned NewNumElts = DemandedElts.countPopulation();
if (!NewNumElts)
return UndefValue::get(II->getType());
+ // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
+ // fully supported.
+ if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
+ return nullptr;
+
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;
}
// Validate function argument and return types, extracting overloaded types
// along the way.
SmallVector<Type *, 6> OverloadTys;
if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys))
return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *EltTy = IIVTy->getElementType();
Type *NewTy =
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
OverloadTys[0] = NewTy;
Function *NewIntrin =
Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys);
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(II);
NewCall->copyMetadata(*II);
if (NewNumElts == 1) {
return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
DemandedElts.countTrailingZeros());
}
SmallVector<int, 8> EltMask;
unsigned NewLoadIdx = 0;
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
if (!!DemandedElts[OrigLoadIdx])
EltMask.push_back(NewLoadIdx++);
else
EltMask.push_back(NewNumElts);
}
Value *Shuffle =
Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
return Shuffle;
}
/// The specified value produces a vector with any number of elements.
/// This method analyzes which elements of the operand are undef and returns
/// that information in UndefElts.
///
/// DemandedElts contains the set of elements that are actually used by the
/// caller, and by default (AllowMultipleUsers equals false) the value is
/// simplified only if it has a single caller. If AllowMultipleUsers is set
/// to true, DemandedElts refers to the union of sets of elements that are
/// used by all callers.
///
/// If the information about demanded elements can be used to simplify the
/// operation, the operation is simplified and the resultant value is
/// returned. This returns null if no change was made.
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts,
unsigned Depth,
bool AllowMultipleUsers) {
// Cannot analyze scalable type. The number of vector elements is not a
// compile-time constant.
if (isa<ScalableVectorType>(V->getType()))
return nullptr;
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
if (isa<UndefValue>(V)) {
// If the entire vector is undefined, just return this info.
UndefElts = EltMask;
return nullptr;
}
if (DemandedElts.isNullValue()) { // If nothing is demanded, provide undef.
UndefElts = EltMask;
return UndefValue::get(V->getType());
}
UndefElts = 0;
if (auto *C = dyn_cast<Constant>(V)) {
// Check if this is identity. If so, return 0 since we are not simplifying
// anything.
if (DemandedElts.isAllOnesValue())
return nullptr;
Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Undef = UndefValue::get(EltTy);
SmallVector<Constant*, 16> Elts;
for (unsigned i = 0; i != VWidth; ++i) {
if (!DemandedElts[i]) { // If not demanded, set to undef.
Elts.push_back(Undef);
UndefElts.setBit(i);
continue;
}
Constant *Elt = C->getAggregateElement(i);
if (!Elt) return nullptr;
if (isa<UndefValue>(Elt)) { // Already undef.
Elts.push_back(Undef);
UndefElts.setBit(i);
} else { // Otherwise, defined.
Elts.push_back(Elt);
}
}
// If we changed the constant, return it.
Constant *NewCV = ConstantVector::get(Elts);
return NewCV != C ? NewCV : nullptr;
}
// Limit search depth.
if (Depth == 10)
return nullptr;
if (!AllowMultipleUsers) {
// If multiple users are using the root value, proceed with
// simplification conservatively assuming that all elements
// are needed.
if (!V->hasOneUse()) {
// Quit if we find multiple users of a non-root value though.
// They'll be handled when it's their turn to be visited by
// the main instcombine process.
if (Depth != 0)
// TODO: Just compute the UndefElts information recursively.
return nullptr;
// Conservatively assume that all elements are needed.
DemandedElts = EltMask;
}
}
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return nullptr; // Only analyze instructions.
bool MadeChange = false;
auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
APInt Demanded, APInt &Undef) {
auto *II = dyn_cast<IntrinsicInst>(Inst);
Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
replaceOperand(*Inst, OpNum, V);
MadeChange = true;
}
};
APInt UndefElts2(VWidth, 0);
APInt UndefElts3(VWidth, 0);
switch (I->getOpcode()) {
default: break;
case Instruction::GetElementPtr: {
// The LangRef requires that struct geps have all constant indices. As
// such, we can't convert any operand to partial undef.
auto mayIndexStructType = [](GetElementPtrInst &GEP) {
for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
I != E; I++)
if (I.isStruct())
return true;
return false;
};
if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
break;
// Conservatively track the demanded elements back through any vector
// operands we may have. We know there must be at least one, or we
// wouldn't have a vector result to get here. Note that we intentionally
// merge the undef bits here since gepping with either an undef base or
// index results in undef.
for (unsigned i = 0; i < I->getNumOperands(); i++) {
if (isa<UndefValue>(I->getOperand(i))) {
// If the entire vector is undefined, just return this info.
UndefElts = EltMask;
return nullptr;
}
if (I->getOperand(i)->getType()->isVectorTy()) {
APInt UndefEltsOp(VWidth, 0);
simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
UndefElts |= UndefEltsOp;
}
}
break;
}
case Instruction::InsertElement: {
// If this is a variable index, we don't know which element it overwrites, so
// demand exactly the same input as we produce.
ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
if (!Idx) {
// Note that we can't propagate undef elt info, because we don't know
// which elt is getting updated.
simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
break;
}
// The element inserted overwrites whatever was there, so the input demanded
// set is simpler than the output set.
unsigned IdxNo = Idx->getZExtValue();
APInt PreInsertDemandedElts = DemandedElts;
if (IdxNo < VWidth)
PreInsertDemandedElts.clearBit(IdxNo);
simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
// If this is inserting an element that isn't demanded, remove this
// insertelement.
if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
Worklist.push(I);
return I->getOperand(0);
}
// The inserted element is defined.
UndefElts.clearBit(IdxNo);
break;
}
case Instruction::ShuffleVector: {
auto *Shuffle = cast<ShuffleVectorInst>(I);
assert(Shuffle->getOperand(0)->getType() ==
Shuffle->getOperand(1)->getType() &&
"Expected shuffle operands to have same type");
unsigned OpWidth =
cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
// Handle trivial case of a splat. Only check the first element of LHS
// operand.
if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
DemandedElts.isAllOnesValue()) {
if (!isa<UndefValue>(I->getOperand(1))) {
I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
MadeChange = true;
}
APInt LeftDemanded(OpWidth, 1);
APInt LHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
if (LHSUndefElts[0])
UndefElts = EltMask;
else
UndefElts.clearAllBits();
break;
}
APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
for (unsigned i = 0; i < VWidth; i++) {
if (DemandedElts[i]) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal != -1u) {
assert(MaskVal < OpWidth * 2 &&
"shufflevector mask index out of range!");
if (MaskVal < OpWidth)
LeftDemanded.setBit(MaskVal);
else
RightDemanded.setBit(MaskVal - OpWidth);
}
}
}
APInt LHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
APInt RHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
// If this shuffle does not change the vector length and the elements
// demanded by this shuffle are an identity mask, then this shuffle is
// unnecessary.
//
// We are assuming canonical form for the mask, so the source vector is
// operand 0 and operand 1 is not used.
//
// Note that if an element is demanded and this shuffle mask is undefined
// for that element, then the shuffle is not considered an identity
// operation. The shuffle prevents poison from the operand vector from
// leaking to the result by replacing poison with an undefined value.
if (VWidth == OpWidth) {
bool IsIdentityShuffle = true;
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (DemandedElts[i] && i != MaskVal) {
IsIdentityShuffle = false;
break;
}
}
if (IsIdentityShuffle)
return Shuffle->getOperand(0);
}
bool NewUndefElts = false;
unsigned LHSIdx = -1u, LHSValIdx = -1u;
unsigned RHSIdx = -1u, RHSValIdx = -1u;
bool LHSUniform = true;
bool RHSUniform = true;
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal == -1u) {
UndefElts.setBit(i);
} else if (!DemandedElts[i]) {
NewUndefElts = true;
UndefElts.setBit(i);
} else if (MaskVal < OpWidth) {
if (LHSUndefElts[MaskVal]) {
NewUndefElts = true;
UndefElts.setBit(i);
} else {
LHSIdx = LHSIdx == -1u ? i : OpWidth;
LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
LHSUniform = LHSUniform && (MaskVal == i);
}
} else {
if (RHSUndefElts[MaskVal - OpWidth]) {
NewUndefElts = true;
UndefElts.setBit(i);
} else {
RHSIdx = RHSIdx == -1u ? i : OpWidth;
RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
}
}
}
// Try to transform a shuffle with a constant vector, where only a single
// element of that constant is used, into a single insertelement instruction.
// shufflevector V, C, <v1, v2, .., ci, .., vm> ->
// insertelement V, C[ci], ci-n
if (OpWidth == Shuffle->getType()->getNumElements()) {
Value *Op = nullptr;
Constant *Value = nullptr;
unsigned Idx = -1u;
// Find constant vector with the single element in shuffle (LHS or RHS).
if (LHSIdx < OpWidth && RHSUniform) {
if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
Op = Shuffle->getOperand(1);
Value = CV->getOperand(LHSValIdx);
Idx = LHSIdx;
}
}
if (RHSIdx < OpWidth && LHSUniform) {
if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
Op = Shuffle->getOperand(0);
Value = CV->getOperand(RHSValIdx);
Idx = RHSIdx;
}
}
// Found constant vector with single element - convert to insertelement.
if (Op && Value) {
Instruction *New = InsertElementInst::Create(
Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
Shuffle->getName());
InsertNewInstWith(New, *Shuffle);
return New;
}
}
if (NewUndefElts) {
// Add additional discovered undefs.
SmallVector<int, 16> Elts;
for (unsigned i = 0; i < VWidth; ++i) {
if (UndefElts[i])
Elts.push_back(UndefMaskElem);
else
Elts.push_back(Shuffle->getMaskValue(i));
}
Shuffle->setShuffleMask(Elts);
MadeChange = true;
}
break;
}
case Instruction::Select: {
// If this is a vector select, try to transform the select condition based
// on the current demanded elements.
SelectInst *Sel = cast<SelectInst>(I);
if (Sel->getCondition()->getType()->isVectorTy()) {
// TODO: We are not doing anything with UndefElts based on this call.
// It is overwritten below based on the other select operands. If an
// element of the select condition is known undef, then we are free to
// choose the output value from either arm of the select. If we know that
// one of those values is undef, then the output can be undef.
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
}
// Next, see if we can transform the arms of the select.
APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
for (unsigned i = 0; i < VWidth; i++) {
// isNullValue() always returns false when called on a ConstantExpr.
// Skip constant expressions to avoid propagating incorrect information.
Constant *CElt = CV->getAggregateElement(i);
if (isa<ConstantExpr>(CElt))
continue;
// TODO: If a select condition element is undef, we can demand from
// either side. If one side is known undef, choosing that side would
// propagate undef.
if (CElt->isNullValue())
DemandedLHS.clearBit(i);
else
DemandedRHS.clearBit(i);
}
}
simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
// Output elements are undefined if the element from each arm is undefined.
// TODO: This can be improved. See comment in select condition handling.
UndefElts = UndefElts2 & UndefElts3;
break;
}
case Instruction::BitCast: {
// Vector->vector casts only.
VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
if (!VTy) break;
unsigned InVWidth = VTy->getNumElements();
APInt InputDemandedElts(InVWidth, 0);
UndefElts2 = APInt(InVWidth, 0);
unsigned Ratio;
if (VWidth == InVWidth) {
// If we are converting from <4 x i32> -> <4 x f32>, we demand the same
// elements as are demanded of us.
Ratio = 1;
InputDemandedElts = DemandedElts;
} else if ((VWidth % InVWidth) == 0) {
// If the number of elements in the output is a multiple of the number of
// elements in the input then an input element is live if any of the
// corresponding output elements are live.
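// e.g. (illustrative): for a bitcast from <2 x i64> to <4 x i32>, Ratio = 2
// and demanded output elements 2 and 3 both mark input element 1 as demanded.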
Ratio = VWidth / InVWidth;
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
if (DemandedElts[OutIdx])
InputDemandedElts.setBit(OutIdx / Ratio);
} else if ((InVWidth % VWidth) == 0) {
// If the number of elements in the input is a multiple of the number of
// elements in the output then an input element is live if the
// corresponding output element is live.
Ratio = InVWidth / VWidth;
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
if (DemandedElts[InIdx / Ratio])
InputDemandedElts.setBit(InIdx);
} else {
// Unsupported so far.
break;
}
simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
if (VWidth == InVWidth) {
UndefElts = UndefElts2;
} else if ((VWidth % InVWidth) == 0) {
// If the number of elements in the output is a multiple of the number of
// elements in the input then an output element is undef if the
// corresponding input element is undef.
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
if (UndefElts2[OutIdx / Ratio])
UndefElts.setBit(OutIdx);
} else if ((InVWidth % VWidth) == 0) {
// If the number of elements in the input is a multiple of the number of
// elements in the output then an output element is undef if all of the
// corresponding input elements are undef.
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
if (SubUndef.countPopulation() == Ratio)
UndefElts.setBit(OutIdx);
}
} else {
llvm_unreachable("Unimp");
}
break;
}
case Instruction::FPTrunc:
case Instruction::FPExt:
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
break;
case Instruction::Call: {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
case Intrinsic::masked_gather: // fallthrough
case Intrinsic::masked_load: {
// Subtlety: If we load from a pointer, the pointer must be valid
// regardless of whether the element is demanded. Doing otherwise risks
// segfaults which didn't exist in the original program.
APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
DemandedPassThrough(DemandedElts);
if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
for (unsigned i = 0; i < VWidth; i++) {
Constant *CElt = CV->getAggregateElement(i);
if (CElt->isNullValue())
DemandedPtrs.clearBit(i);
else if (CElt->isAllOnesValue())
DemandedPassThrough.clearBit(i);
}
if (II->getIntrinsicID() == Intrinsic::masked_gather)
simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
// Output elements are undefined if the element from both sources are.
// TODO: can strengthen via mask as well.
UndefElts = UndefElts2 & UndefElts3;
break;
}
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are specified to zero the upper
// bits rather than pass them through like other scalar intrinsics. So we
// shouldn't just use Arg0 if DemandedElts[0] is clear, as we do for other
// intrinsics. Instead we should return a zero vector.
if (!DemandedElts[0]) {
Worklist.push(II);
return ConstantAggregateZero::get(II->getType());
}
// Only the lower element is used.
DemandedElts = 1;
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// Only the lower element is undefined. The high elements are zero.
UndefElts = UndefElts[0];
break;
// Unary scalar-as-vector operations that work column-wise.
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
Worklist.push(II);
return II->getArgOperand(0);
}
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
// checks).
break;
// Binary scalar-as-vector operations that work column-wise. The high
// elements come from operand 0. The low element is a function of both
// operands.
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
case Intrinsic::x86_sse2_cmp_sd: {
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
Worklist.push(II);
return II->getArgOperand(0);
}
// Only lower element is used for operand 1.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
// Lower element is undefined if both lower elements are undefined.
// Consider things like undef&0. The result is known zero, not undef.
if (!UndefElts2[0])
UndefElts.clearBit(0);
break;
}
// Binary scalar-as-vector operations that work column-wise. The high
// elements come from operand 0 and the low element comes from operand 1.
case Intrinsic::x86_sse41_round_ss:
case Intrinsic::x86_sse41_round_sd: {
// Don't use the low element of operand 0.
APInt DemandedElts2 = DemandedElts;
DemandedElts2.clearBit(0);
simplifyAndSetOp(II, 0, DemandedElts2, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
Worklist.push(II);
return II->getArgOperand(0);
}
// Only lower element is used for operand 1.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
// Take the high undef elements from operand 0 and take the lower element
// from operand 1.
UndefElts.clearBit(0);
UndefElts |= UndefElts2[0];
break;
}
// Three input scalar-as-vector operations that work column-wise. The high
// elements come from operand 0 and the low element is a function of all
// three inputs.
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_max_ss_round:
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
Worklist.push(II);
return II->getArgOperand(0);
}
// Only lower element is used for operand 1 and 2.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
simplifyAndSetOp(II, 2, DemandedElts, UndefElts3);
// Lower element is undefined if all three lower elements are undefined.
// Consider things like undef&0. The result is known zero, not undef.
if (!UndefElts2[0] || !UndefElts3[0])
UndefElts.clearBit(0);
break;
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
case Intrinsic::x86_sse41_packusdw:
case Intrinsic::x86_avx2_packssdw:
case Intrinsic::x86_avx2_packsswb:
case Intrinsic::x86_avx2_packusdw:
case Intrinsic::x86_avx2_packuswb:
case Intrinsic::x86_avx512_packssdw_512:
case Intrinsic::x86_avx512_packsswb_512:
case Intrinsic::x86_avx512_packusdw_512:
case Intrinsic::x86_avx512_packuswb_512: {
auto *Ty0 = II->getArgOperand(0)->getType();
unsigned InnerVWidth = cast<VectorType>(Ty0)->getNumElements();
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
unsigned VWidthPerLane = VWidth / NumLanes;
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
// Per lane, pack the elements of the first input and then the second.
// e.g.
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
for (int OpNum = 0; OpNum != 2; ++OpNum) {
APInt OpDemandedElts(InnerVWidth, 0);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
unsigned LaneIdx = Lane * VWidthPerLane;
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
if (DemandedElts[Idx])
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
}
}
// Demand elements from the operand.
APInt OpUndefElts(InnerVWidth, 0);
simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts);
// Pack the operand's UNDEF elements, one lane at a time.
OpUndefElts = OpUndefElts.zext(VWidth);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
UndefElts |= LaneElts;
}
}
break;
}
// PSHUFB
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
// PERMILVAR
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx512_vpermilvar_ps_512:
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
case Intrinsic::x86_avx512_vpermilvar_pd_512:
// PERMV
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps: {
simplifyAndSetOp(II, 1, DemandedElts, UndefElts);
break;
}
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
UndefElts.setHighBits(VWidth / 2);
break;
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
break;
}
} // switch on IntrinsicID
break;
} // case Call
} // switch on Opcode
// TODO: We bail completely on integer div/rem and shifts because they have
// UB/poison potential, but that should be refined.
BinaryOperator *BO;
if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
// Any change to an instruction with potential poison must clear those flags
// because we can not guarantee those constraints now. Other analysis may
// determine that it is safe to re-apply the flags.
if (MadeChange)
BO->dropPoisonGeneratingFlags();
// Output elements are undefined if both are undefined. Consider things
// like undef & 0. The result is known zero, not undef.
UndefElts &= UndefElts2;
}
// If we've proven all of the lanes undef, return an undef value.
// TODO: Intersect w/demanded lanes
if (UndefElts.isAllOnesValue())
return UndefValue::get(I->getType());
return MadeChange ? I : nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index b3254c10a0b2..17a5ec3f87fa 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1,3893 +1,3893 @@
//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// InstructionCombining - Combine instructions to form fewer, simple
// instructions. This pass does not modify the CFG. This pass is where
// algebraic simplification happens.
//
// This pass combines things like:
// %Y = add i32 %X, 1
// %Z = add i32 %Y, 1
// into:
// %Z = add i32 %X, 2
//
// This is a simple worklist driven algorithm.
//
// This pass guarantees that the following canonicalizations are performed on
// the program:
// 1. If a binary operator has a constant operand, it is moved to the RHS
// 2. Bitwise operators with constant operands are always grouped so that
// shifts are performed first, then or's, then and's, then xor's.
// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
// 4. All cmp instructions on boolean values are replaced with logical ops
// 5. add X, X is represented as (X*2) => (X << 1)
// 6. Multiplies with a power-of-two constant argument are transformed into
// shifts.
// ... etc.
//
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/InstCombine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
STATISTIC(NumCombined , "Number of insts combined");
STATISTIC(NumConstProp, "Number of constant folds");
STATISTIC(NumDeadInst , "Number of dead inst eliminated");
STATISTIC(NumSunkInst , "Number of instructions sunk");
STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc , "Number of reassociations");
DEBUG_COUNTER(VisitCounter, "instcombine-visit",
"Controls which instructions are visited");
static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
static cl::opt<bool>
EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
cl::init(true));
static cl::opt<unsigned> LimitMaxIterations(
"instcombine-max-iterations",
cl::desc("Limit the maximum number of instruction combining iterations"),
cl::init(InstCombineDefaultMaxIterations));
static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
"instcombine-infinite-loop-threshold",
cl::desc("Number of instruction combining iterations considered an "
"infinite loop"),
cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
static cl::opt<unsigned>
MaxArraySize("instcombine-maxarray-size", cl::init(1024),
cl::desc("Maximum array size considered when doing a combine"));
// FIXME: Remove this flag when it is no longer necessary to convert
// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
// increases variable availability at the cost of accuracy. Variables that
// cannot be promoted by mem2reg or SROA will be described as living in memory
// for their entire lifetime. However, passes like DSE and instcombine can
// delete stores to the alloca, leading to misleading and inaccurate debug
// information. This flag can be removed when those passes are fixed.
static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
cl::Hidden, cl::init(true));
Value *InstCombiner::EmitGEPOffset(User *GEP) {
return llvm::EmitGEPOffset(&Builder, DL, GEP);
}
/// Return true if it is desirable to convert an integer computation from a
/// given bit width to a new bit width.
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. A width of '1' is always treated as a legal type
/// because i1 is a fundamental type in IR, and there are many specialized
/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
/// legal to convert to, in order to open up more combining opportunities.
/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
/// from frontend languages.
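/// e.g. (illustrative, assuming a datalayout where i8, i16, i32 and i64 are
/// the legal integer widths): i13 -> i8 is allowed (shrinking to a common
/// width), i32 -> i13 is rejected (legal to illegal), and i160 -> i64 is
/// allowed while i64 -> i160 is not.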
bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
// Convert to widths of 8, 16 or 32 even if they are not legal types. Only
// shrink types, to prevent infinite loops.
if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
return true;
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
if (FromLegal && !ToLegal)
return false;
// Otherwise, if both are illegal, do not increase the size of the result. We
// do allow things like i160 -> i64, but not i64 -> i160.
if (!FromLegal && !ToLegal && ToWidth > FromWidth)
return false;
return true;
}
/// Return true if it is desirable to convert a computation from 'From' to 'To'.
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. i1 is always treated as a legal type because it is
/// a fundamental type in IR, and there are many specialized optimizations for
/// i1 types.
bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
// TODO: This could be extended to allow vectors. Datalayout changes might be
// needed to properly support that.
if (!From->isIntegerTy() || !To->isIntegerTy())
return false;
unsigned FromWidth = From->getPrimitiveSizeInBits();
unsigned ToWidth = To->getPrimitiveSizeInBits();
return shouldChangeType(FromWidth, ToWidth);
}
// Return true if No Signed Wrap should be maintained for I.
// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
// where both B and C must be constant integers, results in a constant that does
// not overflow. This function only handles the Add and Sub opcodes. For
// all other opcodes, the function conservatively returns false.
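// For example, when reassociating "add nsw i8 %x, 100" against a second
// constant 50, 100 + 50 = 150 overflows i8's signed range, so nsw must be
// dropped; with 20 instead, 100 + 20 = 120 still fits and nsw can be kept.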
static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
if (!OBO || !OBO->hasNoSignedWrap())
return false;
// We reason about Add and Sub only.
Instruction::BinaryOps Opcode = I.getOpcode();
if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return false;
const APInt *BVal, *CVal;
if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
return false;
bool Overflow = false;
if (Opcode == Instruction::Add)
(void)BVal->sadd_ov(*CVal, Overflow);
else
(void)BVal->ssub_ov(*CVal, Overflow);
return !Overflow;
}
static bool hasNoUnsignedWrap(BinaryOperator &I) {
auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
return OBO && OBO->hasNoUnsignedWrap();
}
static bool hasNoSignedWrap(BinaryOperator &I) {
auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
return OBO && OBO->hasNoSignedWrap();
}
/// Conservatively clears subclassOptionalData after a reassociation or
/// commutation. Fast-math flags are preserved when applicable.
static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
if (!FPMO) {
I.clearSubclassOptionalData();
return;
}
FastMathFlags FMF = I.getFastMathFlags();
I.clearSubclassOptionalData();
I.setFastMathFlags(FMF);
}
/// Combine constant operands of associative operations either before or after a
/// cast to eliminate one of the associative operations:
/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
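/// Illustrative IR, assuming the one-use and zext/bitwise-logic constraints
/// checked below hold:
///   %a = and i8 %x, 15
///   %z = zext i8 %a to i32
///   %r = and i32 %z, 7
/// becomes
///   %z = zext i8 %x to i32
///   %r = and i32 %z, 7        ; FoldedC = 7 & (zext 15 to i32) = 7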
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) {
auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
if (!Cast || !Cast->hasOneUse())
return false;
// TODO: Enhance logic for other casts and remove this check.
auto CastOpcode = Cast->getOpcode();
if (CastOpcode != Instruction::ZExt)
return false;
// TODO: Enhance logic for other BinOps and remove this check.
if (!BinOp1->isBitwiseLogicOp())
return false;
auto AssocOpcode = BinOp1->getOpcode();
auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
return false;
Constant *C1, *C2;
if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
!match(BinOp2->getOperand(1), m_Constant(C2)))
return false;
// TODO: This assumes a zext cast.
// Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
// to the destination type might lose bits.
// Fold the constants together in the destination type:
// (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
Type *DestTy = C1->getType();
Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
IC.replaceOperand(*BinOp1, 1, FoldedC);
return true;
}
/// This performs a few simplifications for operators that are associative or
/// commutative:
///
/// Commutative operators:
///
/// 1. Order operands such that they are listed from right (least complex) to
/// left (most complex). This puts constants before unary operators before
/// binary operators.
///
/// Associative operators:
///
/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
///
/// Associative and commutative operators:
///
/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
/// if C1 and C2 are constants.
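/// For illustration of (6), "(x + 3) + (y + 5)" becomes "(x + y) + 8", with
/// wrap flags conservatively cleared unless nuw can be re-established.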
bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Instruction::BinaryOps Opcode = I.getOpcode();
bool Changed = false;
do {
// Order operands such that they are listed from right (least complex) to
// left (most complex). This puts constants before unary operators before
// binary operators.
if (I.isCommutative() && getComplexity(I.getOperand(0)) <
getComplexity(I.getOperand(1)))
Changed = !I.swapOperands();
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
if (I.isAssociative()) {
// Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
if (Op0 && Op0->getOpcode() == Opcode) {
Value *A = Op0->getOperand(0);
Value *B = Op0->getOperand(1);
Value *C = I.getOperand(1);
// Does "B op C" simplify?
if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "A op V".
replaceOperand(I, 0, A);
replaceOperand(I, 1, V);
bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
// Conservatively clear all optional flags since they may not be
// preserved by the reassociation. Reset nsw/nuw based on the above
// analysis.
ClearSubclassDataAfterReassociation(I);
// Note: this is only valid because SimplifyBinOp doesn't look at
// the operands to Op0.
if (IsNUW)
I.setHasNoUnsignedWrap(true);
if (IsNSW)
I.setHasNoSignedWrap(true);
Changed = true;
++NumReassoc;
continue;
}
}
// Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
if (Op1 && Op1->getOpcode() == Opcode) {
Value *A = I.getOperand(0);
Value *B = Op1->getOperand(0);
Value *C = Op1->getOperand(1);
// Does "A op B" simplify?
if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op C".
replaceOperand(I, 0, V);
replaceOperand(I, 1, C);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
}
}
}
if (I.isAssociative() && I.isCommutative()) {
if (simplifyAssocCastAssoc(&I, *this)) {
Changed = true;
++NumReassoc;
continue;
}
// Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
if (Op0 && Op0->getOpcode() == Opcode) {
Value *A = Op0->getOperand(0);
Value *B = Op0->getOperand(1);
Value *C = I.getOperand(1);
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op B".
replaceOperand(I, 0, V);
replaceOperand(I, 1, B);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
}
}
// Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
if (Op1 && Op1->getOpcode() == Opcode) {
Value *A = I.getOperand(0);
Value *B = Op1->getOperand(0);
Value *C = Op1->getOperand(1);
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "B op V".
replaceOperand(I, 0, B);
replaceOperand(I, 1, V);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
}
}
// Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
// if C1 and C2 are constants.
Value *A, *B;
Constant *C1, *C2;
if (Op0 && Op1 &&
Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
bool IsNUW = hasNoUnsignedWrap(I) &&
hasNoUnsignedWrap(*Op0) &&
hasNoUnsignedWrap(*Op1);
BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
BinaryOperator::CreateNUW(Opcode, A, B) :
BinaryOperator::Create(Opcode, A, B);
if (isa<FPMathOperator>(NewBO)) {
FastMathFlags Flags = I.getFastMathFlags();
Flags &= Op0->getFastMathFlags();
Flags &= Op1->getFastMathFlags();
NewBO->setFastMathFlags(Flags);
}
InsertNewInstWith(NewBO, I);
NewBO->takeName(Op1);
replaceOperand(I, 0, NewBO);
replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
if (IsNUW)
I.setHasNoUnsignedWrap(true);
Changed = true;
continue;
}
}
// No further simplifications.
return Changed;
} while (true);
}
/// Return whether "X LOp (Y ROp Z)" is always equal to
/// "(X LOp Y) ROp (X LOp Z)".
static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
// X & (Y | Z) <--> (X & Y) | (X & Z)
// X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
if (LOp == Instruction::And)
return ROp == Instruction::Or || ROp == Instruction::Xor;
// X | (Y & Z) <--> (X | Y) & (X | Z)
if (LOp == Instruction::Or)
return ROp == Instruction::And;
// X * (Y + Z) <--> (X * Y) + (X * Z)
// X * (Y - Z) <--> (X * Y) - (X * Z)
if (LOp == Instruction::Mul)
return ROp == Instruction::Add || ROp == Instruction::Sub;
return false;
}
/// Return whether "(X LOp Y) ROp Z" is always equal to
/// "(X ROp Z) LOp (Y ROp Z)".
static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
if (Instruction::isCommutative(ROp))
return leftDistributesOverRight(ROp, LOp);
// (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
// TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
// but this requires knowing that the addition does not overflow and other
// such subtleties.
}
/// This function returns the identity value for the given opcode, which can be used to
/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
if (isa<Constant>(V))
return nullptr;
return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
/// This function predicates factorization using distributive laws. By default,
/// it just returns the 'Op' inputs. But for special-cases like
/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
/// allow more factorization opportunities.
static Instruction::BinaryOps
getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
Value *&LHS, Value *&RHS) {
assert(Op && "Expected a binary operator");
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
Constant *C;
if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
// X << C --> X * (1 << C)
RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
return Instruction::Mul;
}
// TODO: We can add other conversions e.g. shr => div etc.
}
return Op->getOpcode();
}
/// This tries to simplify binary operations by factorizing out common terms
/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
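/// For illustration, given "(X * 5) + (X * 3)" with InnerOpcode = Mul, the
/// common term X is factored out, "5 + 3" constant-folds, and the result is
/// "X * 8" (wrap flags are then set per the checks below).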
Value *InstCombiner::tryFactorization(BinaryOperator &I,
Instruction::BinaryOps InnerOpcode,
Value *A, Value *B, Value *C, Value *D) {
assert(A && B && C && D && "All values must be provided");
Value *V = nullptr;
Value *SimplifiedInst = nullptr;
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
// Does "X op' Y" always equal "Y op' X"?
bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
// Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
// Does the instruction have the form "(A op' B) op (A op' D)" or, in the
// commutative case, "(A op' B) op (C op' A)"?
if (A == C || (InnerCommutative && A == D)) {
if (A != C)
std::swap(C, D);
// Consider forming "A op' (B op D)".
// If "B op D" simplifies then it can be formed with no cost.
V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
// If "B op D" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
if (!V && LHS->hasOneUse() && RHS->hasOneUse())
V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
if (V) {
SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
}
}
// Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
// Does the instruction have the form "(A op' B) op (C op' B)" or, in the
// commutative case, "(A op' B) op (B op' D)"?
if (B == D || (InnerCommutative && B == C)) {
if (B != D)
std::swap(C, D);
// Consider forming "(A op C) op' B".
// If "A op C" simplifies then it can be formed with no cost.
V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
// If "A op C" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
if (!V && LHS->hasOneUse() && RHS->hasOneUse())
V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
if (V) {
SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
}
}
if (SimplifiedInst) {
++NumFactor;
SimplifiedInst->takeName(&I);
// Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
bool HasNSW = false;
bool HasNUW = false;
if (isa<OverflowingBinaryOperator>(&I)) {
HasNSW = I.hasNoSignedWrap();
HasNUW = I.hasNoUnsignedWrap();
}
if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
HasNSW &= LOBO->hasNoSignedWrap();
HasNUW &= LOBO->hasNoUnsignedWrap();
}
if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
HasNSW &= ROBO->hasNoSignedWrap();
HasNUW &= ROBO->hasNoUnsignedWrap();
}
if (TopLevelOpcode == Instruction::Add &&
InnerOpcode == Instruction::Mul) {
// We can propagate 'nsw' if we know that
// %Y = mul nsw i16 %X, C
// %Z = add nsw i16 %Y, %X
// =>
// %Z = mul nsw i16 %X, C+1
//
// iff C+1 isn't INT_MIN
const APInt *CInt;
if (match(V, m_APInt(CInt))) {
if (!CInt->isMinSignedValue())
BO->setHasNoSignedWrap(HasNSW);
}
// nuw can be propagated with any constant or nuw value.
BO->setHasNoUnsignedWrap(HasNUW);
}
}
}
}
return SimplifiedInst;
}
/// This tries to simplify binary operations which some other binary operation
/// distributes over either by factorizing out common terms
/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win).
/// Returns the simplified value, or null if it didn't simplify.
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
{
// Factorization.
Value *A, *B, *C, *D;
Instruction::BinaryOps LHSOpcode, RHSOpcode;
if (Op0)
LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
if (Op1)
RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
// The instruction has the form "(A op' B) op (C op' D)". Try to factorize
// a common term.
if (Op0 && Op1 && LHSOpcode == RHSOpcode)
if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
return V;
// The instruction has the form "(A op' B) op (C)". Try to factorize common
// term.
if (Op0)
if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
return V;
// The instruction has the form "(B) op (C op' D)". Try to factorize common
// term.
if (Op1)
if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
return V;
}
// Expansion.
if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
// The instruction has the form "(A op' B) op C". See if expanding it out
// to "(A op C) op' (B op C)" results in simplifications.
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I));
// Do "A op C" and "B op C" both simplify?
if (L && R) {
// They do! Return "L op' R".
++NumExpand;
C = Builder.CreateBinOp(InnerOpcode, L, R);
C->takeName(&I);
return C;
}
// Does "A op C" simplify to the identity value for the inner opcode?
if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
// They do! Return "B op C".
++NumExpand;
C = Builder.CreateBinOp(TopLevelOpcode, B, C);
C->takeName(&I);
return C;
}
// Does "B op C" simplify to the identity value for the inner opcode?
if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
// They do! Return "A op C".
++NumExpand;
C = Builder.CreateBinOp(TopLevelOpcode, A, C);
C->takeName(&I);
return C;
}
}
if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
// The instruction has the form "A op (B op' C)". See if expanding it out
// to "(A op B) op' (A op C)" results in simplifications.
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I));
Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
// Do "A op B" and "A op C" both simplify?
if (L && R) {
// They do! Return "L op' R".
++NumExpand;
A = Builder.CreateBinOp(InnerOpcode, L, R);
A->takeName(&I);
return A;
}
// Does "A op B" simplify to the identity value for the inner opcode?
if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
// They do! Return "A op C".
++NumExpand;
A = Builder.CreateBinOp(TopLevelOpcode, A, C);
A->takeName(&I);
return A;
}
// Does "A op C" simplify to the identity value for the inner opcode?
if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
// They do! Return "A op B".
++NumExpand;
A = Builder.CreateBinOp(TopLevelOpcode, A, B);
A->takeName(&I);
return A;
}
}
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}
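// For illustration, "mul (select i1 %c, i32 4, i32 8), 2" with a single-use
// select has both arms simplified and becomes "select i1 %c, i32 8, i32 16".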
Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS, Value *RHS) {
Value *A, *B, *C, *D, *E, *F;
bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
if (!LHSIsSelect && !RHSIsSelect)
return nullptr;
FastMathFlags FMF;
BuilderTy::FastMathFlagGuard Guard(Builder);
if (isa<FPMathOperator>(&I)) {
FMF = I.getFastMathFlags();
Builder.setFastMathFlags(FMF);
}
Instruction::BinaryOps Opcode = I.getOpcode();
SimplifyQuery Q = SQ.getWithInstruction(&I);
Value *Cond, *True = nullptr, *False = nullptr;
if (LHSIsSelect && RHSIsSelect && A == D) {
// (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
Cond = A;
True = SimplifyBinOp(Opcode, B, E, FMF, Q);
False = SimplifyBinOp(Opcode, C, F, FMF, Q);
if (LHS->hasOneUse() && RHS->hasOneUse()) {
if (False && !True)
True = Builder.CreateBinOp(Opcode, B, E);
else if (True && !False)
False = Builder.CreateBinOp(Opcode, C, F);
}
} else if (LHSIsSelect && LHS->hasOneUse()) {
// (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
Cond = A;
True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
} else if (RHSIsSelect && RHS->hasOneUse()) {
// X op (D ? E : F) -> D ? (X op E) : (X op F)
Cond = D;
True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
}
if (!True || !False)
return nullptr;
Value *SI = Builder.CreateSelect(Cond, True, False);
SI->takeName(&I);
return SI;
}
/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
/// constant zero (which is the 'negate' form).
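/// For example, "sub i32 0, %x" yields %x, and the constant i32 7 yields
/// i32 -7; integer vector constants are negated element-wise when possible.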
Value *InstCombiner::dyn_castNegVal(Value *V) const {
Value *NegV;
if (match(V, m_Neg(m_Value(NegV))))
return NegV;
// Constants can be considered to be negated values if they can be folded.
if (ConstantInt *C = dyn_cast<ConstantInt>(V))
return ConstantExpr::getNeg(C);
if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
if (C->getType()->getElementType()->isIntegerTy())
return ConstantExpr::getNeg(C);
if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
Constant *Elt = CV->getAggregateElement(i);
if (!Elt)
return nullptr;
if (isa<UndefValue>(Elt))
continue;
if (!isa<ConstantInt>(Elt))
return nullptr;
}
return ConstantExpr::getNeg(CV);
}
return nullptr;
}
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
assert(I.isBinaryOp() && "Unexpected opcode for select folding");
// Figure out if the constant is the left or the right argument.
bool ConstIsRHS = isa<Constant>(I.getOperand(1));
Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
if (auto *SOC = dyn_cast<Constant>(SO)) {
if (ConstIsRHS)
return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
}
Value *Op0 = SO, *Op1 = ConstOperand;
if (!ConstIsRHS)
std::swap(Op0, Op1);
auto *BO = cast<BinaryOperator>(&I);
Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
SO->getName() + ".op");
auto *FPInst = dyn_cast<Instruction>(RI);
if (FPInst && isa<FPMathOperator>(FPInst))
FPInst->copyFastMathFlags(BO);
return RI;
}
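// For illustration, "add (select i1 %c, i32 1, i32 2), 10" with a single-use
// select becomes "select i1 %c, i32 11, i32 12"; the constant arms are folded
// through the operation (the min/max guard below may block this).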
Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
// Don't modify shared select instructions.
if (!SI->hasOneUse())
return nullptr;
Value *TV = SI->getTrueValue();
Value *FV = SI->getFalseValue();
if (!(isa<Constant>(TV) || isa<Constant>(FV)))
return nullptr;
// Bool selects with constant operands can be folded to logical ops.
if (SI->getType()->isIntOrIntVectorTy(1))
return nullptr;
// If it's a bitcast involving vectors, make sure it has the same number of
// elements on both sides.
if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
// Verify that either both or neither are vectors.
if ((SrcTy == nullptr) != (DestTy == nullptr))
return nullptr;
// If vectors, verify that they have the same number of elements.
if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
return nullptr;
}
// Test if a CmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
// any other folding. This helps out other analyses which understand
// non-obfuscated minimum and maximum idioms, such as ScalarEvolution
// and CodeGen. And in this case, at least one of the comparison
// operands has at least one user besides the compare (the select),
// which would often largely negate the benefit of folding anyway.
if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
if (CI->hasOneUse()) {
Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
// FIXME: This is a hack to avoid infinite looping with min/max patterns.
// We have to ensure that vector constants that only differ with
// undef elements are treated as equivalent.
auto areLooselyEqual = [](Value *A, Value *B) {
if (A == B)
return true;
// Test for vector constants.
Constant *ConstA, *ConstB;
if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
return false;
// TODO: Deal with FP constants?
if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
return false;
// Compare for equality including undefs as equal.
auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
const APInt *C;
return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
};
if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
(areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
return nullptr;
}
}
Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
}
static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
InstCombiner::BuilderTy &Builder) {
bool ConstIsRHS = isa<Constant>(I->getOperand(1));
Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
if (auto *InC = dyn_cast<Constant>(InV)) {
if (ConstIsRHS)
return ConstantExpr::get(I->getOpcode(), InC, C);
return ConstantExpr::get(I->getOpcode(), C, InC);
}
Value *Op0 = InV, *Op1 = C;
if (!ConstIsRHS)
std::swap(Op0, Op1);
Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
auto *FPInst = dyn_cast<Instruction>(RI);
if (FPInst && isa<FPMathOperator>(FPInst))
FPInst->copyFastMathFlags(I);
return RI;
}
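// For illustration, with "%p = phi i32 [ 1, %bb1 ], [ 2, %bb2 ]" feeding
// "%r = add i32 %p, 10", the operation is folded into the phi, producing
// "phi i32 [ 11, %bb1 ], [ 12, %bb2 ]" (subject to the use and
// non-constant-operand checks below).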
Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
// We normally only transform phis with a single use. However, if a PHI has
// multiple uses and they are all the same operation, we can fold *all* of the
// uses into the PHI.
if (!PN->hasOneUse()) {
// Walk the use list for the instruction, comparing them to I.
for (User *U : PN->users()) {
Instruction *UI = cast<Instruction>(U);
if (UI != &I && !I.isIdenticalTo(UI))
return nullptr;
}
// Otherwise, we can replace *all* users with the new PHI we form.
}
// Check to see if all of the operands of the PHI are simple constants
// (constantint/constantfp/undef). If there is one non-constant value,
// remember the BB it is in. If there is more than one or if *it* is a PHI,
// bail out. We don't do arbitrary constant expressions here because moving
// their computation can be expensive without a cost model.
BasicBlock *NonConstBB = nullptr;
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal))
continue;
if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
if (NonConstBB) return nullptr; // More than one non-const value.
NonConstBB = PN->getIncomingBlock(i);
// If the InVal is an invoke at the end of the pred block, then we can't
// insert a computation after it without breaking the edge.
if (isa<InvokeInst>(InVal))
if (cast<Instruction>(InVal)->getParent() == NonConstBB)
return nullptr;
// If the incoming non-constant value is in I's block, we will remove one
// instruction, but insert another equivalent one, leading to infinite
// instcombine.
if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
return nullptr;
}
// If there is exactly one non-constant value, we can insert a copy of the
// operation in that block. However, if this is a critical edge, we would be
// inserting the computation on some other paths (e.g. inside a loop). Only
// do this if the pred block is unconditionally branching into the phi block.
if (NonConstBB != nullptr) {
BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
if (!BI || !BI->isUnconditional()) return nullptr;
}
// Okay, we can do the transformation: create the new PHI node.
PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
InsertNewInstBefore(NewPN, *PN);
NewPN->takeName(PN);
// If we are going to have to insert a new computation, do so right before the
// predecessor's terminator.
if (NonConstBB)
Builder.SetInsertPoint(NonConstBB->getTerminator());
// Next, add all of the operands to the PHI.
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
// We only currently try to fold the condition of a select when it is a phi,
// not the true/false values.
Value *TrueV = SI->getTrueValue();
Value *FalseV = SI->getFalseValue();
BasicBlock *PhiTransBB = PN->getParent();
for (unsigned i = 0; i != NumPHIValues; ++i) {
BasicBlock *ThisBB = PN->getIncomingBlock(i);
Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
Value *InV = nullptr;
// Beware of ConstantExpr: it may eventually evaluate to getNullValue,
// even if currently isNullValue gives false.
Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
// For vector constants, we cannot use isNullValue to fold into
// FalseVInPred versus TrueVInPred. When we have individual nonzero
// elements in the vector, we will incorrectly fold InC to
// `TrueVInPred`.
if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
else {
// Generate the select in the same block as PN's current incoming block.
// Note: ThisBB need not be the NonConstBB because vector constants
// which are constants by definition are handled here.
// FIXME: This can lead to an increase in IR generation because we might
// generate selects for vector constant phi operand, that could not be
// folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
// non-vector phis, this transformation was always profitable because
// the select would be generated exactly once in the NonConstBB.
Builder.SetInsertPoint(ThisBB->getTerminator());
InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
FalseVInPred, "phi.sel");
}
NewPN->addIncoming(InV, ThisBB);
}
} else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
Constant *C = cast<Constant>(I.getOperand(1));
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV = nullptr;
if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
else
InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
C, "phi.cmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
} else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
Builder);
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
} else {
CastInst *CI = cast<CastInst>(&I);
Type *RetTy = CI->getType();
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
else
InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
I.getType(), "phi.cast");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
}
for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
Instruction *User = cast<Instruction>(*UI++);
if (User == &I) continue;
replaceInstUsesWith(*User, NewPN);
eraseInstFromFunction(*User);
}
return replaceInstUsesWith(I, NewPN);
}
Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
if (!isa<Constant>(I.getOperand(1)))
return nullptr;
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
return NewSel;
} else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
return NewPhi;
}
return nullptr;
}
/// Given a pointer type and a constant offset, determine whether or not there
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
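/// For example, with PtrTy = { i32, [4 x i16] }* (default layout assumed) and
/// Offset = 10, the computed indices are [0, 1, 3] and the returned element
/// type is i16, i.e. GEP indices "0, 1, 3" land exactly 10 bytes in.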
Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
Type *Ty = PtrTy->getElementType();
if (!Ty->isSized())
return nullptr;
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
Type *IndexTy = DL.getIndexType(PtrTy);
int64_t FirstIdx = 0;
if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
Offset -= FirstIdx*TySize;
// Handle hosts where % returns negative instead of values [0..TySize).
if (Offset < 0) {
--FirstIdx;
Offset += TySize;
assert(Offset >= 0);
}
assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
}
NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
// Index into the types. If we fail, return null.
while (Offset) {
// Indexing into tail padding between struct/array elements.
if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
return nullptr;
if (StructType *STy = dyn_cast<StructType>(Ty)) {
const StructLayout *SL = DL.getStructLayout(STy);
assert(Offset < (int64_t)SL->getSizeInBytes() &&
"Offset must stay within the indexed type");
unsigned Elt = SL->getElementContainingOffset(Offset);
NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
Elt));
Offset -= SL->getElementOffset(Elt);
Ty = STy->getElementType(Elt);
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
Offset %= EltSize;
Ty = AT->getElementType();
} else {
// Otherwise, we can't index into the middle of this atomic type, bail.
return nullptr;
}
}
return Ty;
}
static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
// If this GEP has only 0 indices, it is the same pointer as
// Src. If Src is not a trivial GEP too, don't combine
// the indices.
if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
!Src.hasOneUse())
return false;
return true;
}
/// Return a value X such that Val = X * Scale, or null if none.
/// If the multiplication is known not to overflow, then NoSignedWrap is set.
Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
Scale.getBitWidth() && "Scale not compatible with value!");
// If Val is zero or Scale is one then Val = Val * Scale.
if (match(Val, m_Zero()) || Scale == 1) {
NoSignedWrap = true;
return Val;
}
// If Scale is zero then it does not divide Val.
if (Scale.isMinValue())
return nullptr;
// Look through chains of multiplications, searching for a constant that is
// divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
// will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
// a factor of 4 will produce X*(Y*2). The principle of operation is to bore
// down from Val:
//
// Val = M1 * X || Analysis starts here and works down
// M1 = M2 * Y || Doesn't descend into terms with more
// M2 = Z * 4 \/ than one use
//
// Then to modify a term at the bottom:
//
// Val = M1 * X
// M1 = Z * Y || Replaced M2 with Z
//
// Then to work back up correcting nsw flags.
// Op - the term we are currently analyzing. Starts at Val then drills down.
// Replaced with its descaled value before exiting from the drill down loop.
Value *Op = Val;
// Parent - initially null, but after drilling down notes where Op came from.
// In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
// 0'th operand of Val.
std::pair<Instruction *, unsigned> Parent;
// Set if the transform requires a descaling at deeper levels that doesn't
// overflow.
bool RequireNoSignedWrap = false;
// Log base 2 of the scale. Negative if not a power of 2.
int32_t logScale = Scale.exactLogBase2();
for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// If Op is a constant divisible by Scale then descale to the quotient.
APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
if (!Remainder.isMinValue())
// Not divisible by Scale.
return nullptr;
// Replace with the quotient in the parent.
Op = ConstantInt::get(CI->getType(), Quotient);
NoSignedWrap = true;
break;
}
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
if (BO->getOpcode() == Instruction::Mul) {
// Multiplication.
NoSignedWrap = BO->hasNoSignedWrap();
if (RequireNoSignedWrap && !NoSignedWrap)
return nullptr;
// There are three cases for multiplication: multiplication by exactly
// the scale, multiplication by a constant different to the scale, and
// multiplication by something else.
Value *LHS = BO->getOperand(0);
Value *RHS = BO->getOperand(1);
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// Multiplication by a constant.
if (CI->getValue() == Scale) {
// Multiplication by exactly the scale, replace the multiplication
// by its left-hand side in the parent.
Op = LHS;
break;
}
// Otherwise drill down into the constant.
if (!Op->hasOneUse())
return nullptr;
Parent = std::make_pair(BO, 1);
continue;
}
// Multiplication by something else. Drill down into the left-hand side
// since that's where the reassociate pass puts the good stuff.
if (!Op->hasOneUse())
return nullptr;
Parent = std::make_pair(BO, 0);
continue;
}
if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
isa<ConstantInt>(BO->getOperand(1))) {
// Multiplication by a power of 2.
NoSignedWrap = BO->hasNoSignedWrap();
if (RequireNoSignedWrap && !NoSignedWrap)
return nullptr;
Value *LHS = BO->getOperand(0);
int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
getLimitedValue(Scale.getBitWidth());
// Op = LHS << Amt.
if (Amt == logScale) {
// Multiplication by exactly the scale, replace the multiplication
// by its left-hand side in the parent.
Op = LHS;
break;
}
if (Amt < logScale || !Op->hasOneUse())
return nullptr;
// Multiplication by more than the scale. Reduce the multiplying amount
// by the scale in the parent.
Parent = std::make_pair(BO, 1);
Op = ConstantInt::get(BO->getType(), Amt - logScale);
break;
}
}
if (!Op->hasOneUse())
return nullptr;
if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
if (Cast->getOpcode() == Instruction::SExt) {
// Op is sign-extended from a smaller type, descale in the smaller type.
unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
APInt SmallScale = Scale.trunc(SmallSize);
// Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
// descale Op as (sext Y) * Scale. In order to have
// sext (Y * SmallScale) = (sext Y) * Scale
// some conditions need to hold however: SmallScale must sign-extend to
// Scale and the multiplication Y * SmallScale should not overflow.
if (SmallScale.sext(Scale.getBitWidth()) != Scale)
// SmallScale does not sign-extend to Scale.
return nullptr;
assert(SmallScale.exactLogBase2() == logScale);
// Require that Y * SmallScale must not overflow.
RequireNoSignedWrap = true;
// Drill down through the cast.
Parent = std::make_pair(Cast, 0);
Scale = SmallScale;
continue;
}
if (Cast->getOpcode() == Instruction::Trunc) {
// Op is truncated from a larger type, descale in the larger type.
// Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
// trunc (Y * sext Scale) = (trunc Y) * Scale
// always holds. However (trunc Y) * Scale may overflow even if
// trunc (Y * sext Scale) does not, so nsw flags need to be cleared
// from this point up in the expression (see later).
if (RequireNoSignedWrap)
return nullptr;
// Drill down through the cast.
unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
Parent = std::make_pair(Cast, 0);
Scale = Scale.sext(LargeSize);
if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
logScale = -1;
assert(Scale.exactLogBase2() == logScale);
continue;
}
}
// Unsupported expression, bail out.
return nullptr;
}
// If Op is zero then Val = Op * Scale.
if (match(Op, m_Zero())) {
NoSignedWrap = true;
return Op;
}
// We know that we can successfully descale, so from here on we can safely
// modify the IR. Op holds the descaled version of the deepest term in the
// expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
// not to overflow.
if (!Parent.first)
// The expression only had one term.
return Op;
// Rewrite the parent using the descaled version of its operand.
assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
assert(Op != Parent.first->getOperand(Parent.second) &&
"Descaling was a no-op?");
replaceOperand(*Parent.first, Parent.second, Op);
Worklist.push(Parent.first);
// Now work back up the expression correcting nsw flags. The logic is based
// on the following observation: if X * Y is known not to overflow as a signed
// multiplication, and Y is replaced by a value Z with smaller absolute value,
// then X * Z will not overflow as a signed multiplication either. As we work
// our way up, having NoSignedWrap 'true' means that the descaled value at the
// current level has strictly smaller absolute value than the original.
Instruction *Ancestor = Parent.first;
do {
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
// If the multiplication wasn't nsw then we can't say anything about the
// value of the descaled multiplication, and we have to clear nsw flags
// from this point on up.
bool OpNoSignedWrap = BO->hasNoSignedWrap();
NoSignedWrap &= OpNoSignedWrap;
if (NoSignedWrap != OpNoSignedWrap) {
BO->setHasNoSignedWrap(NoSignedWrap);
Worklist.push(Ancestor);
}
} else if (Ancestor->getOpcode() == Instruction::Trunc) {
// The fact that the descaled input to the trunc has smaller absolute
// value than the original input doesn't tell us anything useful about
// the absolute values of the truncations.
NoSignedWrap = false;
}
assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
"Failed to keep proper track of nsw flags while drilling down?");
if (Ancestor == Val)
// Got to the top, all done!
return Val;
// Move up one level in the expression.
assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
Ancestor = Ancestor->user_back();
} while (true);
}
Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// FIXME: some of this is likely fine for scalable vectors
if (!isa<FixedVectorType>(Inst.getType()))
return nullptr;
BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
assert(cast<VectorType>(LHS->getType())->getElementCount() ==
cast<VectorType>(Inst.getType())->getElementCount());
assert(cast<VectorType>(RHS->getType())->getElementCount() ==
cast<VectorType>(Inst.getType())->getElementCount());
// If both operands of the binop are vector concatenations, then perform the
// narrow binop on each pair of the source operands followed by concatenation
// of the results.
Value *L0, *L1, *R0, *R1;
ArrayRef<int> Mask;
if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
LHS->hasOneUse() && RHS->hasOneUse() &&
cast<ShuffleVectorInst>(LHS)->isConcat() &&
cast<ShuffleVectorInst>(RHS)->isConcat()) {
// This transform does not have the speculative execution constraint as
// below because the shuffle is a concatenation. The new binops are
// operating on exactly the same elements as the existing binop.
// TODO: We could ease the mask requirement to allow different undef lanes,
// but that requires an analysis of the binop-with-undef output value.
Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
BO->copyIRFlags(&Inst);
Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
BO->copyIRFlags(&Inst);
return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
}
// It may not be safe to reorder shuffles and things like div, urem, etc.
// because we may trap when executing those ops on unknown vector elements.
// See PR20059.
if (!isSafeToSpeculativelyExecute(&Inst))
return nullptr;
auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
Value *XY = Builder.CreateBinOp(Opcode, X, Y);
if (auto *BO = dyn_cast<BinaryOperator>(XY))
BO->copyIRFlags(&Inst);
return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
};
// If both arguments of the binary operation are shuffles that use the same
// mask and shuffle within a single vector, move the shuffle after the binop.
Value *V1, *V2;
if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
V1->getType() == V2->getType() &&
(LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
// Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
return createBinOpShuffle(V1, V2, Mask);
}
// If both arguments of a commutative binop are select-shuffles that use the
// same mask with commuted operands, the shuffles are unnecessary.
if (Inst.isCommutative() &&
match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
match(RHS,
m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
auto *LShuf = cast<ShuffleVectorInst>(LHS);
auto *RShuf = cast<ShuffleVectorInst>(RHS);
// TODO: Allow shuffles that contain undefs in the mask?
// That is legal, but it reduces undef knowledge.
// TODO: Allow arbitrary shuffles by shuffling after binop?
// That might be legal, but we have to deal with poison.
if (LShuf->isSelect() &&
!is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
RShuf->isSelect() &&
!is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
// Example:
// LHS = shuffle V1, V2, <0, 5, 6, 3>
// RHS = shuffle V2, V1, <0, 5, 6, 3>
// LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
NewBO->copyIRFlags(&Inst);
return NewBO;
}
}
// If one argument is a shuffle within one vector and the other is a constant,
// try moving the shuffle after the binary operation. This canonicalization
// intends to move shuffles closer to other shuffles and binops closer to
// other binops, so they can be folded. It may also enable demanded elements
// transforms.
unsigned NumElts = cast<FixedVectorType>(Inst.getType())->getNumElements();
Constant *C;
if (match(&Inst,
m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
- m_Constant(C))) &&
+ m_Constant(C))) && !isa<ConstantExpr>(C) &&
cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) {
assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() &&
"Shuffle should not change scalar type");
// Find constant NewC that has property:
// shuffle(NewC, ShMask) = C
// If such constant does not exist (example: ShMask=<0,0> and C=<1,2>)
// reorder is not possible. A 1-to-1 mapping is not required. Example:
// ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
bool ConstOp1 = isa<Constant>(RHS);
ArrayRef<int> ShMask = Mask;
unsigned SrcVecNumElts =
cast<FixedVectorType>(V1->getType())->getNumElements();
UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
bool MayChange = true;
for (unsigned I = 0; I < NumElts; ++I) {
Constant *CElt = C->getAggregateElement(I);
if (ShMask[I] >= 0) {
assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
Constant *NewCElt = NewVecC[ShMask[I]];
// Bail out if:
// 1. The constant vector contains a constant expression.
// 2. The shuffle needs an element of the constant vector that can't
// be mapped to a new constant vector.
// 3. This is a widening shuffle that copies elements of V1 into the
// extended elements (extending with undef is allowed).
if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
I >= SrcVecNumElts) {
MayChange = false;
break;
}
NewVecC[ShMask[I]] = CElt;
}
// If this is a widening shuffle, we must be able to extend with undef
// elements. If the original binop does not produce an undef in the high
// lanes, then this transform is not safe.
// Similarly for undef lanes due to the shuffle mask, we can only
// transform binops that preserve undef.
// TODO: We could shuffle those non-undef constant values into the
// result by using a constant vector (rather than an undef vector)
// as operand 1 of the new binop, but that might be too aggressive
// for target-independent shuffle creation.
if (I >= SrcVecNumElts || ShMask[I] < 0) {
Constant *MaybeUndef =
ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
: ConstantExpr::get(Opcode, CElt, UndefScalar);
if (!isa<UndefValue>(MaybeUndef)) {
MayChange = false;
break;
}
}
}
if (MayChange) {
Constant *NewC = ConstantVector::get(NewVecC);
// It may not be safe to execute a binop on a vector with undef elements
// because the entire instruction can be folded to undef or create poison
// that did not exist in the original code.
if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
// Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
// Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
Value *NewLHS = ConstOp1 ? V1 : NewC;
Value *NewRHS = ConstOp1 ? NewC : V1;
return createBinOpShuffle(NewLHS, NewRHS, Mask);
}
}
// Try to reassociate to sink a splat shuffle after a binary operation.
if (Inst.isAssociative() && Inst.isCommutative()) {
// Canonicalize shuffle operand as LHS.
if (isa<ShuffleVectorInst>(RHS))
std::swap(LHS, RHS);
Value *X;
ArrayRef<int> MaskC;
int SplatIndex;
BinaryOperator *BO;
if (!match(LHS,
m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
!match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
BO->getOpcode() != Opcode)
return nullptr;
// FIXME: This may not be safe if the analysis allows undef elements. By
// moving 'Y' before the splat shuffle, we are implicitly assuming
// that it is not undef/poison at the splat index.
Value *Y, *OtherOp;
if (isSplatValue(BO->getOperand(0), SplatIndex)) {
Y = BO->getOperand(0);
OtherOp = BO->getOperand(1);
} else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
Y = BO->getOperand(1);
OtherOp = BO->getOperand(0);
} else {
return nullptr;
}
// X and Y are splatted values, so perform the binary operation on those
// values followed by a splat followed by the 2nd binary operation:
// bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
UndefValue *Undef = UndefValue::get(Inst.getType());
SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
Value *NewSplat = Builder.CreateShuffleVector(NewBO, Undef, NewMask);
Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
// Intersect FMF on both new binops. Other (poison-generating) flags are
// dropped to be safe.
if (isa<FPMathOperator>(R)) {
R->copyFastMathFlags(&Inst);
R->andIRFlags(BO);
}
if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
NewInstBO->copyIRFlags(R);
return R;
}
return nullptr;
}
/// Try to narrow the width of a binop if at least 1 operand is an extend
/// of a value. This requires a potentially expensive known bits check to make
/// sure the narrow op does not overflow.
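/// For illustration, "add (zext i8 %x to i32), (zext i8 %y to i32)" becomes
/// "zext (add nuw i8 %x, %y) to i32" when the known-bits check proves the
/// narrow add cannot overflow (nsw is used for the sext form instead).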
Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) {
// We need at least one extended operand.
Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
// If this is a sub, we swap the operands since we always want an extension
// on the RHS. The LHS can be an extension or a constant.
if (BO.getOpcode() == Instruction::Sub)
std::swap(Op0, Op1);
Value *X;
bool IsSext = match(Op0, m_SExt(m_Value(X)));
if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
return nullptr;
// If both operands are the same extension from the same source type and we
// can eliminate at least one (hasOneUse), this might work.
CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
Value *Y;
if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
cast<Operator>(Op1)->getOpcode() == CastOpc &&
(Op0->hasOneUse() || Op1->hasOneUse()))) {
// If that did not match, see if we have a suitable constant operand.
// Truncating and extending must produce the same constant.
Constant *WideC;
if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
return nullptr;
Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
return nullptr;
Y = NarrowC;
}
// Swap back now that we found our operands.
if (BO.getOpcode() == Instruction::Sub)
std::swap(X, Y);
// Both operands have narrow versions. Last step: the math must not overflow
// in the narrow width.
if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
return nullptr;
// bo (ext X), (ext Y) --> ext (bo X, Y)
// bo (ext X), C --> ext (bo X, C')
Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
if (IsSext)
NewBinOp->setHasNoSignedWrap();
else
NewBinOp->setHasNoUnsignedWrap();
}
return CastInst::Create(CastOpc, NarrowBO, BO.getType());
}
static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
// At least one GEP must be inbounds.
if (!GEP1.isInBounds() && !GEP2.isInBounds())
return false;
return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) &&
(GEP2.isInBounds() || GEP2.hasAllZeroIndices());
}
/// Thread a GEP operation with constant indices through the constant true/false
/// arms of a select.
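/// For illustration (constants hypothetical): if %p = select i1 %c, @A, @B
/// where @A and @B are constant globals, "gep inbounds %p, 0, 1" becomes
/// "select i1 %c, (gep inbounds @A, 0, 1), (gep inbounds @B, 0, 1)", and both
/// new GEPs fold to constant expressions.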
static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
InstCombiner::BuilderTy &Builder) {
if (!GEP.hasAllConstantIndices())
return nullptr;
Instruction *Sel;
Value *Cond;
Constant *TrueC, *FalseC;
if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
!match(Sel,
m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
return nullptr;
// gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
// Propagate 'inbounds' and metadata from existing instructions.
// Note: using IRBuilder to create the constants for efficiency.
SmallVector<Value *, 4> IndexC(GEP.idx_begin(), GEP.idx_end());
bool IsInBounds = GEP.isInBounds();
Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
: Builder.CreateGEP(TrueC, IndexC);
Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
: Builder.CreateGEP(FalseC, IndexC);
return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
}
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
// For vector geps, use the generic demanded vector support.
// Skip if GEP return type is scalable. The number of elements is unknown at
// compile-time.
if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
auto VWidth = GEPFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
UndefElts)) {
if (V != &GEP)
return replaceInstUsesWith(GEP, V);
return &GEP;
}
// TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
// possible (decide on canonical form for pointer broadcast), 3) exploit
// undef elements to decrease demanded bits
}
Value *PtrOp = GEP.getOperand(0);
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
// Index width may not be the same width as pointer width.
// Data layout chooses the right type based on supported integer types.
Type *NewScalarIndexTy =
DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
++I, ++GTI) {
// Skip indices into struct types.
if (GTI.isStruct())
continue;
Type *IndexTy = (*I)->getType();
Type *NewIndexType =
IndexTy->isVectorTy()
? VectorType::get(NewScalarIndexTy,
cast<VectorType>(IndexTy)->getElementCount())
: NewScalarIndexTy;
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
Type *EltTy = GTI.getIndexedType();
if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
*I = Constant::getNullValue(NewIndexType);
MadeChange = true;
}
if (IndexTy != NewIndexType) {
// If we are using a wider index than needed for this platform, shrink
// it to what we need. If narrower, sign-extend it to what we need.
// This explicit cast can make subsequent optimizations more obvious.
*I = Builder.CreateIntCast(*I, NewIndexType, true);
MadeChange = true;
}
}
if (MadeChange)
return &GEP;
// Check to see if the inputs to the PHI node are getelementptr instructions.
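// For illustration (not from this patch), a pointer operand such as
//   %p = phi i32* [ %g1, %bb1 ], [ %g2, %bb2 ]
// where
//   %g1 = getelementptr i32, i32* %base, i64 %i1
//   %g2 = getelementptr i32, i32* %base, i64 %i2
// i.e. the incoming GEPs agree in all but at most one operand, can be
// rewritten as a single GEP fed by a new PHI of the differing operand.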
if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
if (!Op1)
return nullptr;
// Don't fold a GEP into itself through a PHI node. This can only happen
// through the back-edge of a loop. Folding a GEP into itself means that
// the value of the previous iteration needs to be stored in the meantime,
// thus requiring an additional register variable to be live, but not
// actually achieving anything (the GEP still needs to be executed once per
// loop iteration).
if (Op1 == &GEP)
return nullptr;
int DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
return nullptr;
// As for Op1 above, don't try to fold a GEP into itself.
if (Op2 == &GEP)
return nullptr;
// Keep track of the type as we walk the GEP.
Type *CurTy = nullptr;
for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
return nullptr;
if (Op1->getOperand(J) != Op2->getOperand(J)) {
if (DI == -1) {
// We have not yet seen any differences in the GEPs feeding the
// PHI, so record this one if it is allowed to vary.
// The first two operands can vary for any GEP; the rest have to be
// constant for struct indices.
if (J > 1) {
assert(CurTy && "No current type?");
if (CurTy->isStructTy())
return nullptr;
}
DI = J;
} else {
// The GEPs differ by more than one input. While this could be
// extended to support GEPs that vary by more than one variable, it
// doesn't make sense, since it greatly increases the complexity and
// would result in an R+R+R addressing mode, which no backend
// directly supports and which would need to be broken into several
// simpler instructions anyway.
return nullptr;
}
}
// Sink down a layer of the type for the next iteration.
if (J > 0) {
if (J == 1) {
CurTy = Op1->getSourceElementType();
} else {
CurTy =
GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
}
}
}
}
// If not all GEPs are identical we'll have to create a new PHI node.
// Check that the old PHI node has only one use so that it will get
// removed.
if (DI != -1 && !PN->hasOneUse())
return nullptr;
auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
if (DI == -1) {
// All the GEPs feeding the PHI are identical. Clone one down into our
// BB so that it can be merged with the current GEP.
} else {
// All the GEPs feeding the PHI differ at a single offset. Clone a GEP
// into the current block so it can be merged, and create a new PHI to
// set that index.
PHINode *NewPN;
{
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(PN);
NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
PN->getNumOperands());
}
for (auto &I : PN->operands())
NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
PN->getIncomingBlock(I));
NewGEP->setOperand(DI, NewPN);
}
GEP.getParent()->getInstList().insert(
GEP.getParent()->getFirstInsertionPt(), NewGEP);
replaceOperand(GEP, 0, NewGEP);
PtrOp = NewGEP;
}
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return nullptr;
// Try to reassociate loop invariant GEP chains to enable LICM.
if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
Src->hasOneUse()) {
if (Loop *L = LI->getLoopFor(GEP.getParent())) {
Value *GO1 = GEP.getOperand(1);
Value *SO1 = Src->getOperand(1);
// Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
// invariant: this breaks the dependence between GEPs and allows LICM
// to hoist the invariant part out of the loop.
if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
// We have to be careful here.
// We have something like:
// %src = getelementptr <ty>, <ty>* %base, <ty> %idx
// %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
// If we just swap idx & idx2 then we could inadvertently
// change %src from a vector to a scalar, or vice versa.
// Cases:
// 1) %base a scalar & idx a scalar & idx2 a vector
// => Swapping idx & idx2 turns %src into a vector type.
// 2) %base a scalar & idx a vector & idx2 a scalar
// => Swapping idx & idx2 turns %src into a scalar type.
// 3) %base, %idx, and %idx2 are scalars
// => %src & %gep are scalars
// => swapping idx & idx2 is safe
// 4) %base a vector
// => %src is a vector
// => swapping idx & idx2 is safe.
auto *SO0 = Src->getOperand(0);
auto *SO0Ty = SO0->getType();
if (!isa<VectorType>(GEPType) || // case 3
isa<VectorType>(SO0Ty)) { // case 4
Src->setOperand(1, GO1);
GEP.setOperand(1, SO1);
return &GEP;
} else {
// Case 1 or 2
// -- have to recreate %src & %gep
// put NewSrc at same location as %src
Builder.SetInsertPoint(cast<Instruction>(PtrOp));
auto *NewSrc = cast<GetElementPtrInst>(
Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
NewSrc->setIsInBounds(Src->isInBounds());
auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
NewGEP->setIsInBounds(GEP.isInBounds());
return NewGEP;
}
}
}
}
// Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
return nullptr; // Wait until our source is folded to completion.
SmallVector<Value*, 8> Indices;
// Find out whether the last index in the source GEP is a sequential idx.
bool EndsWithSequential = false;
for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
I != E; ++I)
EndsWithSequential = I.isSequential();
// Can we combine the two pointer arithmetics offsets?
if (EndsWithSequential) {
// Replace: gep (gep %P, long B), long A, ...
// With: T = long A+B; gep %P, T, ...
Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
Value *GO1 = GEP.getOperand(1);
// If they aren't the same type, then the input hasn't been processed
// by the loop above yet (which canonicalizes sequential index types to
// intptr_t). Just avoid transforming this until the input has been
// normalized.
if (SO1->getType() != GO1->getType())
return nullptr;
Value *Sum =
SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
// Only do the combine when we are sure the cost after the
// merge is never more than that before the merge.
if (Sum == nullptr)
return nullptr;
// Update the GEP in place if possible.
if (Src->getNumOperands() == 2) {
GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
replaceOperand(GEP, 0, Src->getOperand(0));
replaceOperand(GEP, 1, Sum);
return &GEP;
}
Indices.append(Src->op_begin()+1, Src->op_end()-1);
Indices.push_back(Sum);
Indices.append(GEP.op_begin()+2, GEP.op_end());
} else if (isa<Constant>(*GEP.idx_begin()) &&
cast<Constant>(*GEP.idx_begin())->isNullValue() &&
Src->getNumOperands() != 1) {
// Otherwise we can do the fold if the first index of the GEP is a zero
Indices.append(Src->op_begin()+1, Src->op_end());
Indices.append(GEP.idx_begin()+1, GEP.idx_end());
}
if (!Indices.empty())
return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
? GetElementPtrInst::CreateInBounds(
Src->getSourceElementType(), Src->getOperand(0), Indices,
GEP.getName())
: GetElementPtrInst::Create(Src->getSourceElementType(),
Src->getOperand(0), Indices,
GEP.getName());
}
// Skip if GEP source element type is scalable. The type alloc size is unknown
// at compile-time.
if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
DL.getIndexSizeInBits(AS)) {
uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
bool Matched = false;
uint64_t C;
Value *V = nullptr;
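// Try to recover a value V whose division by TyAllocSize yields this index:
// the index itself when the element type is one byte wide, or the operand of
// an ashr/sdiv whose shift amount/divisor matches TyAllocSize.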
if (TyAllocSize == 1) {
V = GEP.getOperand(1);
Matched = true;
} else if (match(GEP.getOperand(1),
m_AShr(m_Value(V), m_ConstantInt(C)))) {
if (TyAllocSize == 1ULL << C)
Matched = true;
} else if (match(GEP.getOperand(1),
m_SDiv(m_Value(V), m_ConstantInt(C)))) {
if (TyAllocSize == C)
Matched = true;
}
if (Matched) {
// Canonicalize (gep i8* X, -(ptrtoint Y))
// to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
// The GEP pattern is emitted by the SCEV expander for certain kinds of
// pointer arithmetic.
if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
Operator *Index = cast<Operator>(V);
Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
}
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
// to (bitcast Y)
Value *Y;
if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
m_PtrToInt(m_Specific(GEP.getOperand(0))))))
return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
}
}
}
// We do not handle pointer-vector geps here.
if (GEPType->isVectorTy())
return nullptr;
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
Value *StrippedPtr = PtrOp->stripPointerCasts();
PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
// Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
// into : GEP [10 x i8]* X, i32 0, ...
//
// Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
// into : GEP i8* X, ...
//
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrEltTy) {
// -> GEP i8* X, ...
SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
GetElementPtrInst *Res = GetElementPtrInst::Create(
StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
Res->setIsInBounds(GEP.isInBounds());
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
return Res;
// Insert Res, and create an addrspacecast.
// e.g.,
// GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
// ->
// %0 = GEP i8 addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
}
if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == XATy->getElementType()) {
// -> GEP [10 x i8]* X, i32 0, ...
// At this point, we know that the cast source type is a pointer
// to an array of the same type as the destination pointer
// array. Because the array type is never stepped over (there
// is a leading zero) we can fold the cast into this GEP.
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
GEP.setSourceElementType(XATy);
return replaceOperand(GEP, 0, StrippedPtr);
}
// Cannot replace the base pointer directly because StrippedPtr's
// address space is different. Instead, create a new GEP followed by
// an addrspacecast.
// e.g.,
// GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
// i32 0, ...
// ->
// %0 = GEP [10 x i8] addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end());
Value *NewGEP =
GEP.isInBounds()
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
Idx, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
GEP.getName());
return new AddrSpaceCastInst(NewGEP, GEPType);
}
}
}
} else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
// Skip if GEP source element type is scalable. The type alloc size is
// unknown at compile-time.
// Transform things like: %t = getelementptr i32*
// bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
// x i32]* %str, i32 0, i32 %V; bitcast
if (StrippedPtrEltTy->isArrayTy() &&
DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
DL.getTypeAllocSize(GEPEltType)) {
Type *IdxType = DL.getIndexType(GEPType);
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP =
GEP.isInBounds()
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
GEP.getName());
// V and GEP are both pointer types --> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
}
// Transform things like:
// %V = mul i64 %N, 4
// %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
// into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = SrcSize / ResSize;
// Earlier transforms ensure that the index has the right type
// according to Data Layout, which considerably simplifies the
// logic by eliminating implicit casts.
assert(Idx->getType() == DL.getIndexType(GEPType) &&
"Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
Value *NewGEP =
GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
NewIdx, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
GEPType);
}
}
}
// Similarly, transform things like:
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
StrippedPtrEltTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
uint64_t ArrayEltSize =
DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
.getFixedSize();
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = ArrayEltSize / ResSize;
// Earlier transforms ensure that the index has the right type
// according to the Data Layout, which considerably simplifies
// the logic by eliminating implicit casts.
assert(Idx->getType() == DL.getIndexType(GEPType) &&
"Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
Type *IndTy = DL.getIndexType(GEPType);
Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
Value *NewGEP =
GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
Off, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
GEPType);
}
}
}
}
}
// addrspacecast between types is canonicalized as a bitcast, then an
// addrspacecast. To take advantage of the below bitcast + struct GEP, look
// through the addrspacecast.
Value *ASCStrippedPtrOp = PtrOp;
if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
// X = bitcast A addrspace(1)* to B addrspace(1)*
// Y = addrspacecast A addrspace(1)* to B addrspace(2)*
// Z = gep Y, <...constant indices...>
// Into an addrspacecasted GEP of the struct.
if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
ASCStrippedPtrOp = BC;
}
if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
Value *SrcOp = BCI->getOperand(0);
PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
Type *SrcEltType = SrcType->getElementType();
// GEP directly using the source operand if this GEP is accessing an element
// of a bitcasted pointer to vector or array of the same dimensions:
// gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
// gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
const DataLayout &DL) {
auto *VecVTy = cast<VectorType>(VecTy);
return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
};
if (GEP.getNumOperands() == 3 &&
((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() &&
areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
(GEPEltType->isVectorTy() && SrcEltType->isArrayTy() &&
areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
// Create a new GEP here, as using `setOperand()` followed by
// `setSourceElementType()` won't actually update the type of the
// existing GEP Value, which causes issues if this Value is accessed
// when constructing an AddrSpaceCastInst.
Value *NGEP =
GEP.isInBounds()
? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
: Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
NGEP->takeName(&GEP);
// Preserve GEP address space to satisfy users
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(NGEP, GEPType);
return replaceInstUsesWith(GEP, NGEP);
}
// See if we can simplify:
// X = bitcast A* to B*
// Y = gep X, <...constant indices...>
// into a gep of the original struct. This is important for SROA and alias
// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
APInt Offset(OffsetBits, 0);
if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
I->takeName(BCI);
BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
replaceInstUsesWith(*BCI, I);
}
return &GEP;
}
}
if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(SrcOp, GEPType);
return new BitCastInst(SrcOp, GEPType);
}
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP =
GEP.isInBounds()
? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
: Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
if (NGEP->getType() == GEPType)
return replaceInstUsesWith(GEP, NGEP);
NGEP->takeName(&GEP);
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(NGEP, GEPType);
return new BitCastInst(NGEP, GEPType);
}
}
}
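// A GEP that is not marked inbounds, but whose base is a known alloca and
// whose accumulated constant offset is non-negative and does not exceed the
// allocation size, can safely be rewritten as an inbounds GEP.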
if (!GEP.isInBounds()) {
unsigned IdxWidth =
DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
APInt BasePtrOffset(IdxWidth, 0);
Value *UnderlyingPtrOp =
PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
BasePtrOffset);
if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
BasePtrOffset.isNonNegative()) {
APInt AllocSize(
IdxWidth,
DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
GEP.getName());
}
}
}
}
if (Instruction *R = foldSelectGEP(GEP, Builder))
return R;
return nullptr;
}
static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
Instruction *AI) {
if (isa<ConstantPointerNull>(V))
return true;
if (auto *LI = dyn_cast<LoadInst>(V))
return isa<GlobalVariable>(LI->getPointerOperand());
// Two distinct allocations will never be equal.
// We rely on isAllocLikeFn not looking through bitcasts (LookThroughBitCast
// is false): otherwise, stripping bitcasts of V could make the statement
// below return true even when AI and V (e.g. i8* -> i32* -> i8* of AI) are
// the same allocation.
return isAllocLikeFn(V, TLI) && V != AI;
}
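/// Return true if every (transitive) user of the allocation \p AI is of a
/// kind that may be deleted along with the allocation itself: casts and GEPs
/// of it, equality comparisons against pointers that can never equal it,
/// memory intrinsics writing into it, no-op intrinsics such as assume,
/// lifetime and objectsize markers, calls to free, and stores into it. All
/// such users are collected into \p Users.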
static bool isAllocSiteRemovable(Instruction *AI,
SmallVectorImpl<WeakTrackingVH> &Users,
const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 4> Worklist;
Worklist.push_back(AI);
do {
Instruction *PI = Worklist.pop_back_val();
for (User *U : PI->users()) {
Instruction *I = cast<Instruction>(U);
switch (I->getOpcode()) {
default:
// Give up the moment we see something we can't handle.
return false;
case Instruction::AddrSpaceCast:
case Instruction::BitCast:
case Instruction::GetElementPtr:
Users.emplace_back(I);
Worklist.push_back(I);
continue;
case Instruction::ICmp: {
ICmpInst *ICI = cast<ICmpInst>(I);
// We can fold eq/ne comparisons with null to false/true, respectively.
// We also fold comparisons in some conditions provided the alloc has
// not escaped (see isNeverEqualToUnescapedAlloc).
if (!ICI->isEquality())
return false;
unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
return false;
Users.emplace_back(I);
continue;
}
case Instruction::Call:
// Ignore no-op and store intrinsics.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default:
return false;
case Intrinsic::memmove:
case Intrinsic::memcpy:
case Intrinsic::memset: {
MemIntrinsic *MI = cast<MemIntrinsic>(II);
if (MI->isVolatile() || MI->getRawDest() != PI)
return false;
LLVM_FALLTHROUGH;
}
case Intrinsic::assume:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
case Intrinsic::objectsize:
Users.emplace_back(I);
continue;
}
}
if (isFreeCall(I, TLI)) {
Users.emplace_back(I);
continue;
}
return false;
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(I);
if (SI->isVolatile() || SI->getPointerOperand() != PI)
return false;
Users.emplace_back(I);
continue;
}
}
llvm_unreachable("missing a return?");
}
} while (!Worklist.empty());
return true;
}
Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
// or false as appropriate.
// This is based on the principle that we can substitute our own allocation
// function (which will never return null) rather than knowledge of the
// specific function being called. In some sense this can change the permitted
// outputs of a program (when we convert a malloc to an alloca, the fact that
// the allocation is now on the stack is potentially visible, for example),
// but we believe it does so in a permissible manner.
SmallVector<WeakTrackingVH, 64> Users;
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
// before each store.
TinyPtrVector<DbgVariableIntrinsic *> DIIs;
std::unique_ptr<DIBuilder> DIB;
if (isa<AllocaInst>(MI)) {
DIIs = FindDbgAddrUses(&MI);
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
}
if (isAllocSiteRemovable(&MI, Users, &TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lower all @llvm.objectsize calls first because they may
// use a bitcast/GEP of the alloca we are removing.
if (!Users[i])
continue;
Instruction *I = cast<Instruction>(&*Users[i]);
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
if (II->getIntrinsicID() == Intrinsic::objectsize) {
Value *Result =
lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
replaceInstUsesWith(*I, Result);
eraseInstFromFunction(*I);
Users[i] = nullptr; // Skip examining in the next loop.
}
}
}
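// Now handle the remaining users: comparisons with the allocation fold to a
// constant, stores only feed the dbg.value updates, and everything else is
// about to be erased, so its uses can be replaced with undef.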
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
if (!Users[i])
continue;
Instruction *I = cast<Instruction>(&*Users[i]);
if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
replaceInstUsesWith(*C,
ConstantInt::get(Type::getInt1Ty(C->getContext()),
C->isFalseWhenEqual()));
} else if (auto *SI = dyn_cast<StoreInst>(I)) {
for (auto *DII : DIIs)
ConvertDebugDeclareToDebugValue(DII, SI, *DIB);
} else {
// Casts, GEPs, or anything else: we're about to delete this instruction,
// so it cannot have any valid uses.
replaceInstUsesWith(*I, UndefValue::get(I->getType()));
}
eraseInstFromFunction(*I);
}
if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
// Replace invoke with a NOP intrinsic to maintain the original CFG
Module *M = II->getModule();
Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
None, "", II->getParent());
}
for (auto *DII : DIIs)
eraseInstFromFunction(*DII);
return eraseInstFromFunction(MI);
}
return nullptr;
}
/// Move the call to free before a NULL test.
///
/// Check if this call to free is only reached after its argument has been
/// tested against NULL (property 0).
/// If so, it is legal to move this call into its predecessor block.
///
/// The move is performed only if the block containing the call to free
/// will be removed, i.e.:
/// 1. it has only one predecessor P, and P has two successors
/// 2. it contains the call, no-ops, and an unconditional branch
/// 3. its successor is the same as its predecessor's successor
///
/// Profitability is not a concern here; this function should be called only
/// if the caller knows the transformation would be profitable (e.g., for
/// code size).
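/// For illustration (not from this patch), the CFG shape being matched is:
///   pred:                ; ends in "br i1 (icmp eq/ne %p, null), ..."
///   free_block:          ; single predecessor is "pred"
///     call void @free(i8* %p)
///     br label %succ     ; "succ" is also the other successor of "pred"
/// and the call to free (plus any no-op casts) is hoisted into "pred".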
static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
const DataLayout &DL) {
Value *Op = FI.getArgOperand(0);
BasicBlock *FreeInstrBB = FI.getParent();
BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
// Validate part of constraint #1: Only one predecessor
// FIXME: We can extend the number of predecessors, but in that case, we
// would duplicate the call to free in each predecessor and it may
// not be profitable even for code size.
if (!PredBB)
return nullptr;
// Validate constraint #2: Does this block contain only the call to
// free, no-ops, and an unconditional branch?
BasicBlock *SuccBB;
Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
return nullptr;
// If there are only 2 instructions in the block at this point,
// they are the call to free and the unconditional branch.
// If there are more than 2 instructions, check that the others are no-ops,
// i.e., they won't hurt the performance of the generated code.
if (FreeInstrBB->size() != 2) {
for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
continue;
auto *Cast = dyn_cast<CastInst>(&Inst);
if (!Cast || !Cast->isNoopCast(DL))
return nullptr;
}
}
// Validate the rest of constraint #1 by matching on the pred branch.
Instruction *TI = PredBB->getTerminator();
BasicBlock *TrueBB, *FalseBB;
ICmpInst::Predicate Pred;
if (!match(TI, m_Br(m_ICmp(Pred,
m_CombineOr(m_Specific(Op),
m_Specific(Op->stripPointerCasts())),
m_Zero()),
TrueBB, FalseBB)))
return nullptr;
if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
return nullptr;
// Validate constraint #3: Ensure the null case just falls through.
if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
return nullptr;
assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
"Broken CFG: missing edge from predecessor to successor");
// At this point, we know that everything in FreeInstrBB can be moved
// before TI.
for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
It != End;) {
Instruction &Instr = *It++;
if (&Instr == FreeInstrBBTerminator)
break;
Instr.moveBefore(TI);
}
assert(FreeInstrBB->size() == 1 &&
"Only the branch instruction should remain");
return &FI;
}
Instruction *InstCombiner::visitFree(CallInst &FI) {
Value *Op = FI.getArgOperand(0);
// free undef -> unreachable.
if (isa<UndefValue>(Op)) {
// Leave a marker since we can't modify the CFG here.
CreateNonTerminatorUnreachable(&FI);
return eraseInstFromFunction(FI);
}
// If we have 'free null' delete the instruction. This can happen in stl code
// when lots of inlining happens.
if (isa<ConstantPointerNull>(Op))
return eraseInstFromFunction(FI);
// If we optimize for code size, try to move the call to free before the null
// test so that SimplifyCFG can remove the empty block and dead code
// elimination can remove the branch. I.e., this helps to turn something like:
// if (foo) free(foo);
// into
// free(foo);
//
// Note that we can only do this for 'free' and not for any flavor of
// 'operator delete'; there is no 'operator delete' symbol for which we are
// permitted to invent a call, even if we're passing in a null pointer.
if (MinimizeSize) {
LibFunc Func;
if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
return I;
}
return nullptr;
}
static bool isMustTailCall(Value *V) {
if (auto *CI = dyn_cast<CallInst>(V))
return CI->isMustTailCall();
return false;
}
Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
if (RI.getNumOperands() == 0) // ret void
return nullptr;
Value *ResultOp = RI.getOperand(0);
Type *VTy = ResultOp->getType();
if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
return nullptr;
// Don't replace result of musttail calls.
if (isMustTailCall(ResultOp))
return nullptr;
// There might be assume intrinsics dominating this return that completely
// determine the value. If so, constant fold it.
KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
if (Known.isConstant())
return replaceOperand(RI, 0,
Constant::getIntegerValue(VTy, Known.getConstant()));
return nullptr;
}
Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) {
assert(BI.isUnconditional() && "Only for unconditional branches.");
// If the second-to-last instruction in this basic block (excluding debug
// info and bitcasts of pointers) is a store, and the block ends with an
// unconditional branch, try to move the store into the successor block.
auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
return isa<DbgInfoIntrinsic>(BBI) ||
(isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
};
BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
do {
if (BBI != FirstInstr)
--BBI;
} while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
return dyn_cast<StoreInst>(BBI);
};
if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
if (mergeStoreIntoSuccessor(*SI))
return &BI;
return nullptr;
}
Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
if (BI.isUnconditional())
return visitUnconditionalBranchInst(BI);
// Change br (not X), label True, label False to: br X, label False, True
Value *X = nullptr;
if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
!isa<Constant>(X)) {
// Swap Destinations and condition...
BI.swapSuccessors();
return replaceOperand(BI, 0, X);
}
// If the condition is irrelevant, remove the use so that other
// transforms on the condition become more effective.
if (!isa<ConstantInt>(BI.getCondition()) &&
BI.getSuccessor(0) == BI.getSuccessor(1))
return replaceOperand(
BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
// Canonicalize, for example, fcmp_one -> fcmp_oeq.
CmpInst::Predicate Pred;
if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
m_BasicBlock(), m_BasicBlock())) &&
!isCanonicalPredicate(Pred)) {
// Swap destinations and condition.
CmpInst *Cond = cast<CmpInst>(BI.getCondition());
Cond->setPredicate(CmpInst::getInversePredicate(Pred));
BI.swapSuccessors();
Worklist.push(Cond);
return &BI;
}
return nullptr;
}
Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Value *Cond = SI.getCondition();
Value *Op0;
ConstantInt *AddRHS;
if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
// Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
for (auto Case : SI.cases()) {
Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
assert(isa<ConstantInt>(NewCase) &&
"Result of expression should be constant");
Case.setValue(cast<ConstantInt>(NewCase));
}
return replaceOperand(SI, 0, Op0);
}
KnownBits Known = computeKnownBits(Cond, 0, &SI);
unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
for (auto &C : SI.cases()) {
LeadingKnownZeros = std::min(
LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
LeadingKnownOnes = std::min(
LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
}
unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
// Shrink the condition operand if the new type is smaller than the old type.
// But do not shrink to a non-standard type, because backend can't generate
// good code for that yet.
// TODO: We can make it aggressive again after fixing PR39569.
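// Illustrative example (not from this patch): a switch on an i64 condition
// whose upper 32 bits are known to be zero, with case values that all fit in
// the low 32 bits, can instead switch on an i32 truncation of the condition,
// with each case value truncated to match.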
if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
shouldChangeType(Known.getBitWidth(), NewWidth)) {
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Builder.SetInsertPoint(&SI);
Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
for (auto Case : SI.cases()) {
APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
}
return replaceOperand(SI, 0, NewCond);
}
return nullptr;
}
Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Value *Agg = EV.getAggregateOperand();
if (!EV.hasIndices())
return replaceInstUsesWith(EV, Agg);
if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
SQ.getWithInstruction(&EV)))
return replaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
const unsigned *exti, *exte, *insi, *inse;
for (exti = EV.idx_begin(), insi = IV->idx_begin(),
exte = EV.idx_end(), inse = IV->idx_end();
exti != exte && insi != inse;
++exti, ++insi) {
if (*insi != *exti)
// The insert and extract both reference distinctly different elements.
// This means the extract is not influenced by the insert, and we can
// replace the aggregate operand of the extract with the aggregate
// operand of the insert. i.e., replace
// %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
// %E = extractvalue { i32, { i32 } } %I, 0
// with
// %E = extractvalue { i32, { i32 } } %A, 0
return ExtractValueInst::Create(IV->getAggregateOperand(),
EV.getIndices());
}
if (exti == exte && insi == inse)
// Both iterators are at the end: Index lists are identical. Replace
// %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
// %C = extractvalue { i32, { i32 } } %B, 1, 0
// with "i32 42"
return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
if (exti == exte) {
// The extract list is a prefix of the insert list. i.e. replace
// %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
// %E = extractvalue { i32, { i32 } } %I, 1
// with
// %X = extractvalue { i32, { i32 } } %A, 1
// %E = insertvalue { i32 } %X, i32 42, 0
// by switching the order of the insert and extract (though the
// insertvalue should be left in, since it may have other uses).
Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
EV.getIndices());
return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
makeArrayRef(insi, inse));
}
if (insi == inse)
// The insert list is a prefix of the extract list
// We can simply remove the common indices from the extract and make it
// operate on the inserted value instead of the insertvalue result.
// i.e., replace
// %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
// %E = extractvalue { i32, { i32 } } %I, 1, 0
// with
// %E = extractvalue { i32 } { i32 42 }, 0
return ExtractValueInst::Create(IV->getInsertedValueOperand(),
makeArrayRef(exti, exte));
}
if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
// We're extracting from an overflow intrinsic, see if we're the only user,
// which allows us to simplify multiple result intrinsics to simpler
// things that just get one value.
if (WO->hasOneUse()) {
// Check if we're grabbing only the result of a 'with overflow' intrinsic
// and replace it with a traditional binary instruction.
if (*EV.idx_begin() == 0) {
Instruction::BinaryOps BinOp = WO->getBinaryOp();
Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
eraseInstFromFunction(*WO);
return BinaryOperator::Create(BinOp, LHS, RHS);
}
// If the normal result of the add is dead, and the RHS is a constant,
// we can transform this into a range comparison.
// overflow = uadd a, -4 --> overflow = icmp ugt a, 3
if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
ConstantExpr::getNot(CI));
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Agg))
// If the (non-volatile) load only has one use, we can rewrite this to a
// load from a GEP. This reduces the size of the load. If a load is used
// only by extractvalue instructions then this either must have been
// optimized before, or it is a struct with padding, in which case we
// don't want to do the transformation as it loses padding knowledge.
if (L->isSimple() && L->hasOneUse()) {
// extractvalue has integer indices, getelementptr has Value*s. Convert.
SmallVector<Value*, 4> Indices;
// Prefix an i32 0 since we need the first element.
Indices.push_back(Builder.getInt32(0));
for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
I != E; ++I)
Indices.push_back(Builder.getInt32(*I));
// We need to insert these at the location of the old load, not at that of
// the extractvalue.
Builder.SetInsertPoint(L);
Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
L->getPointerOperand(), Indices);
Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
// Whatever aliasing information we had for the original load must also
// hold for the smaller load, so propagate the annotations.
AAMDNodes Nodes;
L->getAAMetadata(Nodes);
NL->setAAMetadata(Nodes);
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use replaceInstUsesWith().
return replaceInstUsesWith(EV, NL);
}
// We could simplify extracts from other values. Note that nested extracts may
// already be simplified implicitly by the above: extract (extract (insert) )
// will be translated into extract ( insert ( extract ) ) first and then just
// the value inserted, if appropriate. Similarly for extracts from single-use
// loads: extract (extract (load)) will be translated to extract (load (gep))
// and if again single-use then via load (gep (gep)) to load (gep).
// However, double extracts from e.g. function arguments or return values
// aren't handled yet.
return nullptr;
}
/// Return 'true' if the given typeinfo will match anything.
static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
switch (Personality) {
case EHPersonality::GNU_C:
case EHPersonality::GNU_C_SjLj:
case EHPersonality::Rust:
// The GCC C EH and Rust personalities only exist to support cleanups, so
// it's not clear what the semantics of catch clauses are.
return false;
case EHPersonality::Unknown:
return false;
case EHPersonality::GNU_Ada:
// While __gnat_all_others_value will match any Ada exception, it doesn't
// match foreign exceptions (or didn't, before gcc-4.7).
return false;
case EHPersonality::GNU_CXX:
case EHPersonality::GNU_CXX_SjLj:
case EHPersonality::GNU_ObjC:
case EHPersonality::MSVC_X86SEH:
case EHPersonality::MSVC_Win64SEH:
case EHPersonality::MSVC_CXX:
case EHPersonality::CoreCLR:
case EHPersonality::Wasm_CXX:
return TypeInfo->isNullValue();
}
llvm_unreachable("invalid enum");
}
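/// Order landingpad filter clauses by the number of typeinfo entries; used
/// below so that shorter filters, which are more likely to match, come first.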
static bool shorter_filter(const Value *LHS, const Value *RHS) {
return
cast<ArrayType>(LHS->getType())->getNumElements()
<
cast<ArrayType>(RHS->getType())->getNumElements();
}
Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// The logic here should be correct for any real-world personality function.
// However if that turns out not to be true, the offending logic can always
// be conditioned on the personality function, like the catch-all logic is.
EHPersonality Personality =
classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
// Simplify the list of clauses, e.g. by removing repeated catch clauses
// (these are often created by inlining).
bool MakeNewInstruction = false; // If true, recreate using the following:
SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
bool isLastClause = i + 1 == e;
if (LI.isCatch(i)) {
// A catch clause.
Constant *CatchClause = LI.getClause(i);
Constant *TypeInfo = CatchClause->stripPointerCasts();
// If we already saw this clause, there is no point in having a second
// copy of it.
if (AlreadyCaught.insert(TypeInfo).second) {
// This catch clause was not already seen.
NewClauses.push_back(CatchClause);
} else {
// Repeated catch clause - drop the redundant copy.
MakeNewInstruction = true;
}
// If this is a catch-all then there is no point in keeping any following
// clauses or marking the landingpad as having a cleanup.
if (isCatchAll(Personality, TypeInfo)) {
if (!isLastClause)
MakeNewInstruction = true;
CleanupFlag = false;
break;
}
} else {
// A filter clause. If any of the filter elements were already caught
// then they can be dropped from the filter. It is tempting to try to
// exploit the filter further by saying that any typeinfo that does not
// occur in the filter can't be caught later (and thus can be dropped).
// However this would be wrong, since typeinfos can match without being
// equal (for example if one represents a C++ class, and the other some
// class derived from it).
assert(LI.isFilter(i) && "Unsupported landingpad clause!");
Constant *FilterClause = LI.getClause(i);
ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
unsigned NumTypeInfos = FilterType->getNumElements();
// An empty filter catches everything, so there is no point in keeping any
// following clauses or marking the landingpad as having a cleanup. By
// dealing with this case here the following code is made a bit simpler.
if (!NumTypeInfos) {
NewClauses.push_back(FilterClause);
if (!isLastClause)
MakeNewInstruction = true;
CleanupFlag = false;
break;
}
bool MakeNewFilter = false; // If true, make a new filter.
SmallVector<Constant *, 16> NewFilterElts; // New elements.
if (isa<ConstantAggregateZero>(FilterClause)) {
// Not an empty filter - it contains at least one null typeinfo.
assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
Constant *TypeInfo =
Constant::getNullValue(FilterType->getElementType());
// If this typeinfo is a catch-all then the filter can never match.
if (isCatchAll(Personality, TypeInfo)) {
// Throw the filter away.
MakeNewInstruction = true;
continue;
}
// There is no point in having multiple copies of this typeinfo, so
// discard all but the first copy if there is more than one.
NewFilterElts.push_back(TypeInfo);
if (NumTypeInfos > 1)
MakeNewFilter = true;
} else {
ConstantArray *Filter = cast<ConstantArray>(FilterClause);
SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
NewFilterElts.reserve(NumTypeInfos);
// Remove any filter elements that were already caught or that already
// occurred in the filter. While there, see if any of the elements are
// catch-alls. If so, the filter can be discarded.
bool SawCatchAll = false;
for (unsigned j = 0; j != NumTypeInfos; ++j) {
Constant *Elt = Filter->getOperand(j);
Constant *TypeInfo = Elt->stripPointerCasts();
if (isCatchAll(Personality, TypeInfo)) {
// This element is a catch-all. Bail out, noting this fact.
SawCatchAll = true;
break;
}
// Even if we've seen a type in a catch clause, we don't want to
// remove it from the filter. An unexpected type handler may be
// set up for a call site which throws an exception of the same
// type caught. In order for the exception thrown by the unexpected
// handler to propagate correctly, the filter must be correctly
// described for the call site.
//
// Example:
//
// void unexpected() { throw 1;}
// void foo() throw (int) {
// std::set_unexpected(unexpected);
// try {
// throw 2.0;
// } catch (int i) {}
// }
// There is no point in having multiple copies of the same typeinfo in
// a filter, so only add it if we didn't already.
if (SeenInFilter.insert(TypeInfo).second)
NewFilterElts.push_back(cast<Constant>(Elt));
}
// A filter containing a catch-all cannot match anything by definition.
if (SawCatchAll) {
// Throw the filter away.
MakeNewInstruction = true;
continue;
}
// If we dropped something from the filter, make a new one.
if (NewFilterElts.size() < NumTypeInfos)
MakeNewFilter = true;
}
if (MakeNewFilter) {
FilterType = ArrayType::get(FilterType->getElementType(),
NewFilterElts.size());
FilterClause = ConstantArray::get(FilterType, NewFilterElts);
MakeNewInstruction = true;
}
NewClauses.push_back(FilterClause);
// If the new filter is empty then it will catch everything so there is
// no point in keeping any following clauses or marking the landingpad
// as having a cleanup. The case of the original filter being empty was
// already handled above.
if (MakeNewFilter && !NewFilterElts.size()) {
assert(MakeNewInstruction && "New filter but not a new instruction!");
CleanupFlag = false;
break;
}
}
}
// If several filters occur in a row then reorder them so that the shortest
// filters come first (those with the smallest number of elements). This is
// advantageous because shorter filters are more likely to match, speeding up
// unwinding, but mostly because it increases the effectiveness of the other
// filter optimizations below.
for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
unsigned j;
// Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
for (j = i; j != e; ++j)
if (!isa<ArrayType>(NewClauses[j]->getType()))
break;
// Check whether the filters are already sorted by length. We need to know
// if sorting them is actually going to do anything so that we only make a
// new landingpad instruction if it does.
for (unsigned k = i; k + 1 < j; ++k)
if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
// Not sorted, so sort the filters now. Doing an unstable sort would be
// correct too but reordering filters pointlessly might confuse users.
std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
shorter_filter);
MakeNewInstruction = true;
break;
}
// Look for the next batch of filters.
i = j + 1;
}
// If typeinfos matched if and only if they were equal, then the elements of a filter L
// that occurs later than a filter F could be replaced by the intersection of
// the elements of F and L. In reality two typeinfos can match without being
// equal (for example if one represents a C++ class, and the other some class
// derived from it) so it would be wrong to perform this transform in general.
// However the transform is correct and useful if F is a subset of L. In that
// case L can be replaced by F, and thus removed altogether since repeating a
// filter is pointless. So here we look at all pairs of filters F and L where
// L follows F in the list of clauses, and remove L if every element of F is
// an element of L. This can occur when inlining C++ functions with exception
// specifications.
for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
// Examine each filter in turn.
Value *Filter = NewClauses[i];
ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
if (!FTy)
// Not a filter - skip it.
continue;
unsigned FElts = FTy->getNumElements();
// Examine each filter following this one. Doing this backwards means that
// we don't have to worry about filters disappearing under us when removed.
for (unsigned j = NewClauses.size() - 1; j != i; --j) {
Value *LFilter = NewClauses[j];
ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
if (!LTy)
// Not a filter - skip it.
continue;
// If Filter is a subset of LFilter, i.e. every element of Filter is also
// an element of LFilter, then discard LFilter.
SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
// If Filter is empty then it is a subset of LFilter.
if (!FElts) {
// Discard LFilter.
NewClauses.erase(J);
MakeNewInstruction = true;
// Move on to the next filter.
continue;
}
unsigned LElts = LTy->getNumElements();
// If Filter is longer than LFilter then it cannot be a subset of it.
if (FElts > LElts)
// Move on to the next filter.
continue;
// At this point we know that LFilter has at least one element.
if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
// Filter is a subset of LFilter iff Filter contains only zeros (as we
// already know that Filter is not longer than LFilter).
if (isa<ConstantAggregateZero>(Filter)) {
assert(FElts <= LElts && "Should have handled this case earlier!");
// Discard LFilter.
NewClauses.erase(J);
MakeNewInstruction = true;
}
// Move on to the next filter.
continue;
}
ConstantArray *LArray = cast<ConstantArray>(LFilter);
if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
// Since Filter is non-empty and contains only zeros, it is a subset of
// LFilter iff LFilter contains a zero.
assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
for (unsigned l = 0; l != LElts; ++l)
if (LArray->getOperand(l)->isNullValue()) {
// LFilter contains a zero - discard it.
NewClauses.erase(J);
MakeNewInstruction = true;
break;
}
// Move on to the next filter.
continue;
}
// At this point we know that both filters are ConstantArrays. Loop over
// operands to see whether every element of Filter is also an element of
// LFilter. Since filters tend to be short this is probably faster than
// using a method that scales nicely.
ConstantArray *FArray = cast<ConstantArray>(Filter);
bool AllFound = true;
for (unsigned f = 0; f != FElts; ++f) {
Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
AllFound = false;
for (unsigned l = 0; l != LElts; ++l) {
Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
if (LTypeInfo == FTypeInfo) {
AllFound = true;
break;
}
}
if (!AllFound)
break;
}
if (AllFound) {
// Discard LFilter.
NewClauses.erase(J);
MakeNewInstruction = true;
}
// Move on to the next filter.
}
}
// If we changed any of the clauses, replace the old landingpad instruction
// with a new one.
if (MakeNewInstruction) {
LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
NewClauses.size());
for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
NLI->addClause(NewClauses[i]);
// A landing pad with no clauses must have the cleanup flag set. It is
// theoretically possible, though highly unlikely, that we eliminated all
// clauses. If so, force the cleanup flag to true.
if (NewClauses.empty())
CleanupFlag = true;
NLI->setCleanup(CleanupFlag);
return NLI;
}
// Even if none of the clauses changed, we may nonetheless have understood
// that the cleanup flag is pointless. Clear it if so.
if (LI.isCleanup() != CleanupFlag) {
assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
LI.setCleanup(CleanupFlag);
return &LI;
}
return nullptr;
}
Instruction *InstCombiner::visitFreeze(FreezeInst &I) {
Value *Op0 = I.getOperand(0);
if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
return nullptr;
}
/// Try to move the specified instruction from its current block into the
/// beginning of DestBlock, which can only happen if it's safe to move the
/// instruction past all of the instructions between it and the end of its
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving instructions, volatile loads, vaarg, etc.
if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
I->isTerminator())
return false;
// Do not sink static or dynamic alloca instructions. Static allocas must
// remain in the entry block, and dynamic allocas must not be sunk in between
// a stacksave / stackrestore pair, which would incorrectly shorten its
// lifetime.
if (isa<AllocaInst>(I))
return false;
// Do not sink into catchswitch blocks.
if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
return false;
// Do not sink convergent call instructions.
if (auto *CI = dyn_cast<CallInst>(I)) {
if (CI->isConvergent())
return false;
}
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
// We don't want to do any sophisticated alias analysis, so we only check
// the instructions after I in I's parent block if we try to sink to its
// successor block.
if (DestBlock->getUniquePredecessor() != I->getParent())
return false;
for (BasicBlock::iterator Scan = I->getIterator(),
E = I->getParent()->end();
Scan != E; ++Scan)
if (Scan->mayWriteToMemory())
return false;
}
I->dropDroppableUses([DestBlock](const Use *U) {
if (auto *I = dyn_cast<Instruction>(U->getUser()))
return I->getParent() != DestBlock;
return true;
});
/// FIXME: We could remove droppable uses that are not dominated by
/// the new position.
BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
I->moveBefore(&*InsertPos);
++NumSunkInst;
// Also sink all related debug uses from the source basic block. Otherwise we
// get debug uses before the def. Attempt to salvage debug uses first, to
// maximise the range over which variables have a location. If we cannot salvage, then
// mark the location undef: we know it was supposed to receive a new location
// here, but that computation has been sunk.
SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
findDbgUsers(DbgUsers, I);
// Update the arguments of a dbg.declare instruction, so that it
// does not point into a sunk instruction.
auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
if (!isa<DbgDeclareInst>(DII))
return false;
if (isa<CastInst>(I))
DII->setOperand(
0, MetadataAsValue::get(I->getContext(),
ValueAsMetadata::get(I->getOperand(0))));
return true;
};
SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
for (auto User : DbgUsers) {
// A dbg.declare instruction should not be cloned, since there can only be
// one per variable fragment. It should be left in the original place
// because the sunk instruction is not an alloca (otherwise we could not be
// here).
if (User->getParent() != SrcBlock || updateDbgDeclare(User))
continue;
DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
}
// Perform salvaging without the clones, then sink the clones.
if (!DIIClones.empty()) {
salvageDebugInfoForDbgValues(*I, DbgUsers);
for (auto &DIIClone : DIIClones) {
DIIClone->insertBefore(&*InsertPos);
LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
}
}
return true;
}
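// Illustrative sketch of the effect (hypothetical IR, not from a test case),
// assuming %exit's unique predecessor is %entry and %a has its single use in
// %exit:
//
//   entry:                                   ; before
//     %a = add i32 %x, 1
//     br i1 %c, label %exit, label %other
//   exit:
//     %r = mul i32 %a, 2
//
//   exit:                                    ; after sinking %a
//     %a = add i32 %x, 1
//     %r = mul i32 %a, 2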
bool InstCombiner::run() {
while (!Worklist.isEmpty()) {
// Walk deferred instructions in reverse order, and push them to the
// worklist, which means they'll end up popped from the worklist in-order.
while (Instruction *I = Worklist.popDeferred()) {
// Check to see if we can DCE the instruction. We already do this here to
// reduce the number of uses and thus allow other folds to trigger.
// Note that eraseInstFromFunction() may push additional instructions on
// the deferred worklist, so this will DCE whole instruction chains.
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
++NumDeadInst;
continue;
}
Worklist.push(I);
}
Instruction *I = Worklist.removeOne();
if (I == nullptr) continue; // skip null values.
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
++NumDeadInst;
continue;
}
if (!DebugCounter::shouldExecute(VisitCounter))
continue;
// Instruction isn't dead, see if we can constant propagate it.
if (!I->use_empty() &&
(I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
<< '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
++NumConstProp;
if (isInstructionTriviallyDead(I, &TLI))
eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
}
}
// See if we can trivially sink this instruction to its user if we can
// prove that the successor is not executed more frequently than our block.
if (EnableCodeSinking)
if (Use *SingleUse = I->getSingleUndroppableUse()) {
BasicBlock *BB = I->getParent();
Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
BasicBlock *UserParent;
// Get the block the use occurs in.
if (PHINode *PN = dyn_cast<PHINode>(UserInst))
UserParent = PN->getIncomingBlock(*SingleUse);
else
UserParent = UserInst->getParent();
if (UserParent != BB) {
// See if the user is one of our successors that has only one
// predecessor, so that we don't have to split the critical edge.
bool ShouldSink = UserParent->getUniquePredecessor() == BB;
// Another option where we can sink is a block that ends with a
// terminator that does not pass control to any other block (such as
// return or unreachable). In this case:
// - I dominates the User (by SSA form);
// - the User will be executed at most once.
// So sinking I down to User is always profitable or neutral.
if (!ShouldSink) {
auto *Term = UserParent->getTerminator();
ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
}
if (ShouldSink) {
assert(DT.dominates(BB, UserParent) &&
"Dominance relation broken?");
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since sinking
// can expose opportunities for its *operands*, add them to the
// worklist.
for (Use &U : I->operands())
if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
Worklist.push(OpI);
}
}
}
}
// Now that we have an instruction, try combining it to simplify it.
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
#ifndef NDEBUG
std::string OrigI;
#endif
LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
if (Instruction *Result = visit(*I)) {
++NumCombined;
// Should we replace the old instruction with a new one?
if (Result != I) {
LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
<< " New = " << *Result << '\n');
if (I->getDebugLoc())
Result->setDebugLoc(I->getDebugLoc());
// Everything uses the new instruction now.
I->replaceAllUsesWith(Result);
// Move the name to the new instruction first.
Result->takeName(I);
// Insert the new instruction into the basic block...
BasicBlock *InstParent = I->getParent();
BasicBlock::iterator InsertPos = I->getIterator();
// If we replace a PHI with something that isn't a PHI, fix up the
// insertion point.
if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
InsertPos = InstParent->getFirstInsertionPt();
InstParent->getInstList().insert(InsertPos, Result);
// Push the new instruction and any users onto the worklist.
Worklist.pushUsersToWorkList(*Result);
Worklist.push(Result);
eraseInstFromFunction(*I);
} else {
LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
<< " New = " << *I << '\n');
// If the instruction was modified, it's possible that it is now dead;
// if so, remove it.
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
} else {
Worklist.pushUsersToWorkList(*I);
Worklist.push(I);
}
}
MadeIRChange = true;
}
}
Worklist.zap();
return MadeIRChange;
}
/// Populate the IC worklist from a function, by walking it in depth-first
/// order and adding all reachable code to the worklist.
///
/// This has a couple of tricks to make the code faster and more powerful. In
/// particular, we constant fold and DCE instructions as we go, to avoid adding
/// them to the worklist (this significantly speeds up instcombine on code where
/// many instructions are dead or constant). Additionally, if we find a branch
/// whose condition is a known constant, we only visit the reachable successors.
static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI,
InstCombineWorklist &ICWorklist) {
bool MadeIRChange = false;
SmallPtrSet<BasicBlock *, 32> Visited;
SmallVector<BasicBlock*, 256> Worklist;
Worklist.push_back(&F.front());
SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
DenseMap<Constant *, Constant *> FoldedConstants;
do {
BasicBlock *BB = Worklist.pop_back_val();
// We have now visited this block! If we've already been here, ignore it.
if (!Visited.insert(BB).second)
continue;
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *Inst = &*BBI++;
// ConstantProp instruction if trivially constant.
if (!Inst->use_empty() &&
(Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
<< '\n');
Inst->replaceAllUsesWith(C);
++NumConstProp;
if (isInstructionTriviallyDead(Inst, TLI))
Inst->eraseFromParent();
MadeIRChange = true;
continue;
}
// See if we can constant fold its operands.
for (Use &U : Inst->operands()) {
if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
continue;
auto *C = cast<Constant>(U);
Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
FoldRes = ConstantFoldConstant(C, DL, TLI);
if (FoldRes != C) {
LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
<< "\n Old = " << *C
<< "\n New = " << *FoldRes << '\n');
U = FoldRes;
MadeIRChange = true;
}
}
// Skip processing debug intrinsics in InstCombine. Processing these call
// instructions consumes a non-trivial amount of time and provides no value
// for the optimization.
if (!isa<DbgInfoIntrinsic>(Inst))
InstrsForInstCombineWorklist.push_back(Inst);
}
// Recursively visit successors. If this is a branch or switch on a
// constant, only visit the reachable successor.
Instruction *TI = BB->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
Worklist.push_back(ReachableBB);
continue;
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
continue;
}
}
for (BasicBlock *SuccBB : successors(TI))
Worklist.push_back(SuccBB);
} while (!Worklist.empty());
// Remove instructions inside unreachable blocks. This prevents the
// instcombine code from having to deal with some bad special cases, and
// reduces use counts of instructions.
for (BasicBlock &BB : F) {
if (Visited.count(&BB))
continue;
unsigned NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
MadeIRChange |= NumDeadInstInBB > 0;
NumDeadInst += NumDeadInstInBB;
}
// Once we've found all of the instructions to add to instcombine's worklist,
// add them in reverse order. This way instcombine will visit from the top
// of the function down. This jibes well with the way that it adds all uses
// of instructions to the worklist after doing a transformation, thus avoiding
// some N^2 behavior in pathological cases.
ICWorklist.reserve(InstrsForInstCombineWorklist.size());
for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
// DCE instruction if trivially dead. As we iterate in reverse program
// order here, we will clean up whole chains of dead instructions.
if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
salvageDebugInfo(*Inst);
Inst->eraseFromParent();
MadeIRChange = true;
continue;
}
ICWorklist.push(Inst);
}
return MadeIRChange;
}
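// Illustrative sketch of the reachability pruning above (hypothetical IR, not
// from a test case): given
//
//   entry:
//     br i1 true, label %live, label %dead
//
// only %live is pushed onto the block worklist; if %dead has no other
// predecessors it stays unvisited, and its non-terminator instructions are
// later removed by removeAllNonTerminatorAndEHPadInstructions.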
static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
auto &DL = F.getParent()->getDataLayout();
MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
F.getContext(), TargetFolder(DL),
IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
Worklist.add(I);
if (match(I, m_Intrinsic<Intrinsic::assume>()))
AC.registerAssumption(cast<CallInst>(I));
}));
// Lower dbg.declare intrinsics; otherwise their value may be clobbered
// by the instcombiner.
bool MadeIRChange = false;
if (ShouldLowerDbgDeclare)
MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
unsigned Iteration = 0;
while (true) {
++Iteration;
if (Iteration > InfiniteLoopDetectionThreshold) {
report_fatal_error(
"Instruction Combining seems stuck in an infinite loop after " +
Twine(InfiniteLoopDetectionThreshold) + " iterations.");
}
if (Iteration > MaxIterations) {
LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
<< " on " << F.getName()
<< " reached; stopping before reaching a fixpoint\n");
break;
}
LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA,
AC, TLI, DT, ORE, BFI, PSI, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
if (!IC.run())
break;
MadeIRChange = true;
}
return MadeIRChange;
}
InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
InstCombinePass::InstCombinePass(unsigned MaxIterations)
: MaxIterations(MaxIterations) {}
PreservedAnalyses InstCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
// Mark all the analyses that instcombine updates as preserved.
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
return PA;
}
void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
bool InstructionCombiningPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
// Required analyses.
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
// Optional analyses.
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
ProfileSummaryInfo *PSI =
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
BlockFrequencyInfo *BFI =
(PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
nullptr;
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI);
}
char InstructionCombiningPass::ID = 0;
InstructionCombiningPass::InstructionCombiningPass()
: FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
}
InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
: FunctionPass(ID), MaxIterations(MaxIterations) {
initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
}
INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
// Initialization Routines
void llvm::initializeInstCombine(PassRegistry &Registry) {
initializeInstructionCombiningPassPass(Registry);
}
void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
initializeInstructionCombiningPassPass(*unwrap(R));
}
FunctionPass *llvm::createInstructionCombiningPass() {
return new InstructionCombiningPass();
}
FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
return new InstructionCombiningPass(MaxIterations);
}
void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createInstructionCombiningPass());
}
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9d0500419a7f..2f379b7f6160 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1,2968 +1,2976 @@
//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Jump Threading pass.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <memory>
#include <utility>
using namespace llvm;
using namespace jumpthreading;
#define DEBUG_TYPE "jump-threading"
STATISTIC(NumThreads, "Number of jumps threaded");
STATISTIC(NumFolds, "Number of terminators folded");
STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
static cl::opt<unsigned>
BBDuplicateThreshold("jump-threading-threshold",
cl::desc("Max block size to duplicate for jump threading"),
cl::init(6), cl::Hidden);
static cl::opt<unsigned>
ImplicationSearchThreshold(
"jump-threading-implication-search-threshold",
cl::desc("The number of predecessors to search for a stronger "
"condition to use to thread over a weaker condition"),
cl::init(3), cl::Hidden);
static cl::opt<bool> PrintLVIAfterJumpThreading(
"print-lvi-after-jump-threading",
cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
cl::Hidden);
static cl::opt<bool> ThreadAcrossLoopHeaders(
"jump-threading-across-loop-headers",
cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
cl::init(false), cl::Hidden);
namespace {
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
/// predecessors of the block can be proven to always jump to one of the
/// successors, we forward the edge from the predecessor to the successor by
/// duplicating the contents of this block.
///
/// An example of when this can occur is code like this:
///
/// if () { ...
/// X = 4;
/// }
/// if (X < 3) {
///
/// In this case, the unconditional branch at the end of the first if can be
/// revectored to the false side of the second if.
class JumpThreading : public FunctionPass {
JumpThreadingPass Impl;
public:
static char ID; // Pass identification
JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
void releaseMemory() override { Impl.releaseMemory(); }
};
} // end anonymous namespace
char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
return new JumpThreading(Threshold);
}
JumpThreadingPass::JumpThreadingPass(int T) {
DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
}
// Update branch probability information according to conditional
// branch probability. This is usually made possible for cloned branches
// in inline instances by the context specific profile in the caller.
// For instance,
//
// [Block PredBB]
// [Branch PredBr]
// if (t) {
// Block A;
// } else {
// Block B;
// }
//
// [Block BB]
// cond = PN([true, %A], [..., %B]); // PHI node
// [Branch CondBr]
// if (cond) {
// ... // P(cond == true) = 1%
// }
//
// Here we know that when block A is taken, cond must be true, which means
// P(cond == true | A) = 1
//
// Given that P(cond == true) = P(cond == true | A) * P(A) +
// P(cond == true | B) * P(B)
// we get:
// P(cond == true) = P(A) + P(cond == true | B) * P(B)
//
// which gives us:
// P(A) <= P(cond == true), i.e.
// P(t == true) <= P(cond == true)
//
// In other words, if we know P(cond == true) is unlikely, we know
// that P(t == true) is also unlikely.
//
static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
if (!CondBr)
return;
uint64_t TrueWeight, FalseWeight;
if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
return;
if (TrueWeight + FalseWeight == 0)
// Zero branch_weights do not give a hint for computing branch probabilities.
// Technically it would also result in a division by zero, since the
// denominator is TrueWeight + FalseWeight.
return;
// Returns the outgoing edge of the dominating predecessor block
// that leads to the PhiNode's incoming block:
auto GetPredOutEdge =
[](BasicBlock *IncomingBB,
BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
auto *PredBB = IncomingBB;
auto *SuccBB = PhiBB;
SmallPtrSet<BasicBlock *, 16> Visited;
while (true) {
BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
if (PredBr && PredBr->isConditional())
return {PredBB, SuccBB};
Visited.insert(PredBB);
auto *SinglePredBB = PredBB->getSinglePredecessor();
if (!SinglePredBB)
return {nullptr, nullptr};
// Stop searching when SinglePredBB has been visited. It means we have
// found an unreachable loop.
if (Visited.count(SinglePredBB))
return {nullptr, nullptr};
SuccBB = PredBB;
PredBB = SinglePredBB;
}
};
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *PhiOpnd = PN->getIncomingValue(i);
ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
if (!CI || !CI->getType()->isIntegerTy(1))
continue;
BranchProbability BP =
(CI->isOne() ? BranchProbability::getBranchProbability(
TrueWeight, TrueWeight + FalseWeight)
: BranchProbability::getBranchProbability(
FalseWeight, TrueWeight + FalseWeight));
auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
if (!PredOutEdge.first)
return;
BasicBlock *PredBB = PredOutEdge.first;
BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!PredBr)
return;
uint64_t PredTrueWeight, PredFalseWeight;
// FIXME: We currently only set the profile data when it is missing.
// With PGO, this can be used to refine even existing profile data with
// context information. This needs to be done after more performance
// testing.
if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
continue;
// We cannot infer anything useful when BP >= 50%, because BP is the
// upper bound probability value.
if (BP >= BranchProbability(50, 100))
continue;
SmallVector<uint32_t, 2> Weights;
if (PredBr->getSuccessor(0) == PredOutEdge.second) {
Weights.push_back(BP.getNumerator());
Weights.push_back(BP.getCompl().getNumerator());
} else {
Weights.push_back(BP.getCompl().getNumerator());
Weights.push_back(BP.getNumerator());
}
PredBr->setMetadata(LLVMContext::MD_prof,
MDBuilder(PredBr->getParent()->getContext())
.createBranchWeights(Weights));
}
}
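// Worked example of the derivation above (hypothetical weights, for
// illustration only): if CondBr carries !prof weights {1, 99}, then
// BP = P(cond == true) = 1 / (1 + 99) = 1%. For an incoming value of true
// from block A, P(t == true) <= P(cond == true) = 1%, so the conditional
// branch in the dominating predecessor is annotated with weights proportional
// to 1:99 (or 99:1, depending on which successor of PredBr lies on the edge
// toward A).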
/// runOnFunction - Toplevel algorithm.
bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
if (F.hasProfileData()) {
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
LVI->printLVI(F, DTU.getDomTree(), dbgs());
}
return Changed;
}
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
if (F.hasProfileData()) {
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LazyValueAnalysis>();
return PA;
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
LazyValueInfo *LVI_, AliasAnalysis *AA_,
DomTreeUpdater *DTU_, bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
AA = AA_;
DTU = DTU_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
auto *GuardDecl = F.getParent()->getFunction(
Intrinsic::getName(Intrinsic::experimental_guard));
HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
}
// Reduce the number of instructions duplicated when optimizing strictly for
// size.
if (BBDuplicateThreshold.getNumOccurrences())
BBDupThreshold = BBDuplicateThreshold;
else if (F.hasFnAttribute(Attribute::MinSize))
BBDupThreshold = 3;
else
BBDupThreshold = DefaultBBDupThreshold;
// JumpThreading must not process blocks unreachable from entry. It's a
// waste of compute time and can potentially lead to hangs.
SmallPtrSet<BasicBlock *, 16> Unreachable;
assert(DTU && "DTU isn't passed into JumpThreading before using it.");
assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
DominatorTree &DT = DTU->getDomTree();
for (auto &BB : F)
if (!DT.isReachableFromEntry(&BB))
Unreachable.insert(&BB);
if (!ThreadAcrossLoopHeaders)
FindLoopHeaders(F);
bool EverChanged = false;
bool Changed;
do {
Changed = false;
for (auto &BB : F) {
if (Unreachable.count(&BB))
continue;
while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
// Jump threading may have introduced redundant debug values into BB
// which should be removed.
if (Changed)
RemoveRedundantDbgInstrs(&BB);
// Stop processing BB if it's the entry or is now deleted. The following
// routines attempt to eliminate BB, and locating a suitable replacement
// for the entry is non-trivial.
if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
continue;
if (pred_empty(&BB)) {
// When ProcessBlock makes BB unreachable it doesn't bother to fix up
// the instructions in it. We must remove BB to prevent invalid IR.
LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
<< "' with terminator: " << *BB.getTerminator()
<< '\n');
LoopHeaders.erase(&BB);
LVI->eraseBlock(&BB);
DeleteDeadBlock(&BB, DTU);
Changed = true;
continue;
}
// ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
// is "almost empty", we attempt to merge BB with its sole successor.
auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional()) {
BasicBlock *Succ = BI->getSuccessor(0);
if (
// The terminator must be the only non-phi instruction in BB.
BB.getFirstNonPHIOrDbg()->isTerminator() &&
// Don't alter Loop headers and latches to ensure another pass can
// detect and transform nested loops later.
!LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
RemoveRedundantDbgInstrs(Succ);
// BB is valid for cleanup here because we passed in DTU. F remains
// BB's parent until a DTU->getDomTree() event.
LVI->eraseBlock(&BB);
Changed = true;
}
}
}
EverChanged |= Changed;
} while (Changed);
LoopHeaders.clear();
return EverChanged;
}
// Replace uses of Cond with ToVal when safe to do so. If all uses are
// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
// because we may incorrectly replace uses when guards/assumes are uses of
// `Cond` and we used the guards/assumes to reason about the `Cond` value
// at the end of the block. RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
assert(Cond->getType() == ToVal->getType());
auto *BB = Cond->getParent();
// We can unconditionally replace all uses in non-local blocks (i.e. uses
// strictly dominated by BB), since LVI information is true from the
// terminator of BB.
replaceNonLocalUsesWith(Cond, ToVal);
for (Instruction &I : reverse(*BB)) {
// Reached the Cond whose uses we are trying to replace, so there are no
// more uses.
if (&I == Cond)
break;
// We only replace uses in instructions that are guaranteed to reach the end
// of BB, where we know Cond is ToVal.
if (!isGuaranteedToTransferExecutionToSuccessor(&I))
break;
I.replaceUsesOfWith(Cond, ToVal);
}
if (Cond->use_empty() && !Cond->mayHaveSideEffects())
Cond->eraseFromParent();
}
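// Illustrative sketch of why a plain RAUW would be wrong here (hypothetical
// IR, not from a test case): suppose LVI derived the value of %cond from a
// guard,
//
//   %cond = icmp ult i32 %x, 10
//   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
//
// then %cond is only known to be true *after* the guard executes; the guard
// operand itself and any uses before it must not be replaced, while uses in
// blocks strictly dominated by this block may be.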
/// Return the cost of duplicating a piece of this block, from the first
/// non-phi up to (but not including) the StopAt instruction, in order to
/// thread across it. Stop scanning the block once the threshold is exceeded.
/// If duplication is impossible, returns ~0U.
static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
Instruction *StopAt,
unsigned Threshold) {
assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
// FIXME: THREADING will delete values that are just used to compute the
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
if (BB->getTerminator() == StopAt) {
// Threading through a switch statement is particularly profitable. If this
// block ends in a switch, decrease its cost to make it more likely to
// happen.
if (isa<SwitchInst>(StopAt))
Bonus = 6;
// The same holds for indirect branches, but slightly more so.
if (isa<IndirectBrInst>(StopAt))
Bonus = 8;
}
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
Threshold += Bonus;
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
return Size;
// Debugger intrinsics don't incur code size.
if (isa<DbgInfoIntrinsic>(I)) continue;
// If this is a pointer->pointer bitcast, it is free.
if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
continue;
// Bail out if this instruction gives back a token type; it is not possible
// to duplicate it if it is used outside this BB.
if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
return ~0U;
// All other instructions count for at least one unit.
++Size;
// Calls are more expensive. If they are non-intrinsic calls, we model them
// as having a cost of 4. If they are a non-vector intrinsic, we model them
// as having a cost of 2 total, and if they are a vector intrinsic, we model
// them as having a cost of 1.
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
if (CI->cannotDuplicate() || CI->isConvergent())
// Blocks with NoDuplicate are modelled as having infinite cost, so they
// are never duplicated.
return ~0U;
else if (!isa<IntrinsicInst>(CI))
Size += 3;
else if (!CI->getType()->isVectorTy())
Size += 1;
}
}
return Size > Bonus ? Size - Bonus : 0;
}
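// Worked example of the cost model above (hypothetical block, for
// illustration): a block containing a pointer-to-pointer bitcast (free), two
// adds (1 each) and a non-intrinsic call (4), ending in a switch (Bonus = 6),
// accumulates Size = 1 + 1 + 4 = 6 and returns
// Size > Bonus ? Size - Bonus : 0, i.e. 0, so it is cheap to duplicate when
// threading through the switch.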
/// FindLoopHeaders - We do not want jump threading to turn proper loop
/// structures into irreducible loops. Doing this breaks up the loop nesting
/// hierarchy and pessimizes later transformations. To prevent this from
/// happening, we first have to find the loop headers. Here we approximate this
/// by finding targets of backedges in the CFG.
///
/// Note that there definitely are cases when we want to allow threading of
/// edges across a loop header. For example, threading a jump from outside the
/// loop (the preheader) to an exit block of the loop is definitely profitable.
/// It is also almost always profitable to thread backedges from within the loop
/// to exit blocks, and is often profitable to thread backedges to other blocks
/// within the loop (forming a nested loop). This simple analysis is not rich
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
void JumpThreadingPass::FindLoopHeaders(Function &F) {
SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
FindFunctionBackedges(F, Edges);
for (const auto &Edge : Edges)
LoopHeaders.insert(Edge.second);
}
/// getKnownConstant - Helper method to determine if we can thread over a
/// terminator with the given value as its condition, and if so what value to
/// use for that. What kind of value this is depends on whether we want an
/// integer or a block address, but an undef is always accepted.
/// Returns null if Val is null or not an appropriate constant.
static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
if (!Val)
return nullptr;
// Undef is "known" enough.
if (UndefValue *U = dyn_cast<UndefValue>(Val))
return U;
if (Preference == WantBlockAddress)
return dyn_cast<BlockAddress>(Val->stripPointerCasts());
return dyn_cast<ConstantInt>(Val);
}
/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
/// in any of our predecessors. If so, return the list of known (value,
/// predecessor BB) pairs in the result vector.
///
/// This returns true if there were any known values.
bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Value *V, BasicBlock *BB, PredValueInfo &Result,
ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
Instruction *CxtI) {
// This method walks up use-def chains recursively. Because of this, we could
// get into an infinite loop going around loops in the use-def chain. To
// prevent this, keep track of what (value, block) pairs we've already visited
// and terminate the search if we loop back to them.
if (!RecursionSet.insert(V).second)
return false;
// If V is a constant, then it is known in all predecessors.
if (Constant *KC = getKnownConstant(V, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
Result.emplace_back(KC, Pred);
return !Result.empty();
}
// If V is a non-instruction value, or an instruction in a different block,
// then it can't be derived from a PHI.
Instruction *I = dyn_cast<Instruction>(V);
if (!I || I->getParent() != BB) {
// Okay, if this is a live-in value, see if it has a known value at the end
// of any of our predecessors.
//
// FIXME: This should be an edge property, not a block end property.
/// TODO: Per PR2563, we could infer value range information about a
/// predecessor based on its terminator.
//
// FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
// "I" is a non-local compare-with-a-constant instruction. This would be
// able to handle value inequalities better, for example if the compare is
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
if (Constant *KC = getKnownConstant(PredCst, Preference))
Result.emplace_back(KC, P);
}
return !Result.empty();
}
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
Result.emplace_back(KC, PN->getIncomingBlock(i));
} else {
Constant *CI = LVI->getConstantOnEdge(InVal,
PN->getIncomingBlock(i),
BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference))
Result.emplace_back(KC, PN->getIncomingBlock(i));
}
}
return !Result.empty();
}
// Handle Cast instructions. Only see through a Cast when the source operand
// is a PHI or Cmp, to save compilation time.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
return false;
ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
RecursionSet, CxtI);
if (Result.empty())
return false;
// Convert the known values.
for (auto &R : Result)
R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
return true;
}
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
assert(Preference == WantInteger && "One-bit non-integer type?");
// X | true -> true
// X & false -> false
if (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And) {
PredValueInfoTy LHSVals, RHSVals;
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
ComputeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
WantInteger, RecursionSet, CxtI);
if (LHSVals.empty() && RHSVals.empty())
return false;
ConstantInt *InterestingVal;
if (I->getOpcode() == Instruction::Or)
InterestingVal = ConstantInt::getTrue(I->getContext());
else
InterestingVal = ConstantInt::getFalse(I->getContext());
SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
// Scan for the sentinel. If we find an undef, force it to the
// interesting value: x|undef -> true and x&undef -> false.
for (const auto &LHSVal : LHSVals)
if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
Result.emplace_back(InterestingVal, LHSVal.second);
LHSKnownBBs.insert(LHSVal.second);
}
for (const auto &RHSVal : RHSVals)
if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
// If we already inferred a value for this block on the LHS, don't
// re-add it.
if (!LHSKnownBBs.count(RHSVal.second))
Result.emplace_back(InterestingVal, RHSVal.second);
}
return !Result.empty();
}
// Handle the NOT form of XOR.
if (I->getOpcode() == Instruction::Xor &&
isa<ConstantInt>(I->getOperand(1)) &&
cast<ConstantInt>(I->getOperand(1))->isOne()) {
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
WantInteger, RecursionSet, CxtI);
if (Result.empty())
return false;
// Invert the known values.
for (auto &R : Result)
R.first = ConstantExpr::getNot(R.first);
return true;
}
// Try to simplify some other binary operator values.
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
assert(Preference != WantBlockAddress
&& "A binary operator creating a block address?");
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
// Try to use constant folding to simplify the binary operator.
for (const auto &LHSVal : LHSVals) {
Constant *V = LHSVal.first;
Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
Result.emplace_back(KC, LHSVal.second);
}
}
return !Result.empty();
}
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
assert(Preference == WantInteger && "Compares only produce integers");
Type *CmpType = Cmp->getType();
Value *CmpLHS = Cmp->getOperand(0);
Value *CmpRHS = Cmp->getOperand(1);
CmpInst::Predicate Pred = Cmp->getPredicate();
PHINode *PN = dyn_cast<PHINode>(CmpLHS);
if (!PN)
PN = dyn_cast<PHINode>(CmpRHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
Value *LHS, *RHS;
if (PN == CmpLHS) {
LHS = PN->getIncomingValue(i);
RHS = CmpRHS->DoPHITranslation(BB, PredBB);
} else {
LHS = CmpLHS->DoPHITranslation(BB, PredBB);
RHS = PN->getIncomingValue(i);
}
Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
// getPredicateOnEdge call will make no sense if LHS is defined in BB.
auto LHSInst = dyn_cast<Instruction>(LHS);
if (LHSInst && LHSInst->getParent() == BB)
continue;
LazyValueInfo::Tristate
ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
CxtI ? CxtI : Cmp);
if (ResT == LazyValueInfo::Unknown)
continue;
Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
}
if (Constant *KC = getKnownConstant(Res, WantInteger))
Result.emplace_back(KC, PredBB);
}
return !Result.empty();
}
// If comparing a live-in value against a constant, see if we know the
// live-in value on any predecessors.
if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
Constant *CmpConst = cast<Constant>(CmpRHS);
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
LazyValueInfo::Tristate Res =
LVI->getPredicateOnEdge(Pred, CmpLHS,
CmpConst, P, BB, CxtI ? CxtI : Cmp);
if (Res == LazyValueInfo::Unknown)
continue;
Constant *ResC = ConstantInt::get(CmpType, Res);
Result.emplace_back(ResC, P);
}
return !Result.empty();
}
// InstCombine can fold some forms of constant range checks into
// (icmp (add (x, C1)), C2). See if we have such a thing with
// x as a live-in.
{
using namespace PatternMatch;
Value *AddLHS;
ConstantInt *AddConst;
if (isa<ConstantInt>(CmpConst) &&
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
// block.
ConstantRange CR = LVI->getConstantRangeOnEdge(
AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
// Propagate the range through the addition.
CR = CR.add(AddConst->getValue());
// Get the range where the compare returns true.
ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
Pred, cast<ConstantInt>(CmpConst)->getValue());
Constant *ResC;
if (CmpRange.contains(CR))
ResC = ConstantInt::getTrue(CmpType);
else if (CmpRange.inverse().contains(CR))
ResC = ConstantInt::getFalse(CmpType);
else
continue;
Result.emplace_back(ResC, P);
}
return !Result.empty();
}
}
}
// Try to find a constant value for the LHS of a comparison,
// and evaluate it statically if we can.
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
for (const auto &LHSVal : LHSVals) {
Constant *V = LHSVal.first;
Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
Result.emplace_back(KC, LHSVal.second);
}
return !Result.empty();
}
}
if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
// Handle select instructions where at least one operand is a known constant
// and we can figure out the condition value for any predecessor block.
Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
PredValueInfoTy Conds;
if ((TrueVal || FalseVal) &&
ComputeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
WantInteger, RecursionSet, CxtI)) {
for (auto &C : Conds) {
Constant *Cond = C.first;
// Figure out what value to use for the condition.
bool KnownCond;
if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
// A known boolean.
KnownCond = CI->isOne();
} else {
assert(isa<UndefValue>(Cond) && "Unexpected condition value");
// Either operand will do, so be sure to pick the one that's a known
// constant.
// FIXME: Do this more cleverly if both values are known constants?
KnownCond = (TrueVal != nullptr);
}
// See if the select has a known constant value for this predecessor.
if (Constant *Val = KnownCond ? TrueVal : FalseVal)
Result.emplace_back(Val, C.second);
}
return !Result.empty();
}
}
// If all else fails, see if LVI can figure out a constant value for us.
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
Result.emplace_back(KC, Pred);
}
return !Result.empty();
}
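// Illustrative sketch (hypothetical IR, not from a test case): for
//
//   bb:
//     %p = phi i1 [ true, %pred1 ], [ %x, %pred2 ]
//     br i1 %p, label %t, label %f
//
// the PHI case above records (true, %pred1) directly, while %pred2 is only
// added if LVI can prove a constant value for %x on the %pred2 -> %bb edge.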
/// GetBestDestForJumpOnUndef - If we determine that the specified block ends
/// in an undefined jump, decide which block is best to revector to.
///
/// Since we can pick an arbitrary destination, we pick the successor with the
/// fewest predecessors. This should reduce the in-degree of the others.
static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
Instruction *BBTerm = BB->getTerminator();
unsigned MinSucc = 0;
BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
// Compute the successor with the minimum number of predecessors.
unsigned MinNumPreds = pred_size(TestBB);
for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
TestBB = BBTerm->getSuccessor(i);
unsigned NumPreds = pred_size(TestBB);
if (NumPreds < MinNumPreds) {
MinSucc = i;
MinNumPreds = NumPreds;
}
}
return MinSucc;
}
static bool hasAddressTakenAndUsed(BasicBlock *BB) {
if (!BB->hasAddressTaken()) return false;
// If the block has its address taken, it may be a tree of dead constants
// hanging off of it. These shouldn't keep the block alive.
BlockAddress *BA = BlockAddress::get(BB);
BA->removeDeadConstantUsers();
return !BA->use_empty();
}
/// ProcessBlock - If there are any predecessors whose control can be threaded
/// through to a successor, transform them now.
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
if (DTU->isBBPendingDeletion(BB) ||
(pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
return false;
// If this block has a single predecessor, and if that pred has a single
// successor, merge the blocks. This encourages recursive jump threading
// because now the condition in this block can be threaded through
// predecessors of our predecessor block.
if (MaybeMergeBasicBlockIntoOnlyPred(BB))
return true;
if (TryToUnfoldSelectInCurrBB(BB))
return true;
// Look if we can propagate guards to predecessors.
if (HasGuards && ProcessGuards(BB))
return true;
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
// Look to see if the terminator is a conditional branch, switch or indirect
// branch; if not, we can't thread it.
Value *Condition;
Instruction *Terminator = BB->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
// Can't thread an unconditional jump.
if (BI->isUnconditional()) return false;
Condition = BI->getCondition();
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
Condition = SI->getCondition();
} else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
// Can't thread indirect branch with no successors.
if (IB->getNumSuccessors() == 0) return false;
Condition = IB->getAddress()->stripPointerCasts();
Preference = WantBlockAddress;
} else {
return false; // Must be an invoke or callbr.
}
// Run constant folding to see if we can reduce the condition to a simple
// constant.
if (Instruction *I = dyn_cast<Instruction>(Condition)) {
Value *SimpleVal =
ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
if (SimpleVal) {
I->replaceAllUsesWith(SimpleVal);
if (isInstructionTriviallyDead(I, TLI))
I->eraseFromParent();
Condition = SimpleVal;
}
}
// If the terminator is branching on an undef, we can pick any of the
// successors to branch to. Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) {
unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
Instruction *BBTerm = BB->getTerminator();
Updates.reserve(BBTerm->getNumSuccessors());
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
BasicBlock *Succ = BBTerm->getSuccessor(i);
Succ->removePredecessor(BB, true);
Updates.push_back({DominatorTree::Delete, BB, Succ});
}
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding undef terminator: " << *BBTerm << '\n');
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
BBTerm->eraseFromParent();
DTU->applyUpdatesPermissive(Updates);
return true;
}
// If the terminator of this block is branching on a constant, simplify the
// terminator to an unconditional branch. This can occur due to threading in
// other blocks.
if (getKnownConstant(Condition, Preference)) {
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding terminator: " << *BB->getTerminator()
<< '\n');
++NumFolds;
ConstantFoldTerminator(BB, true, nullptr, DTU);
return true;
}
Instruction *CondInst = dyn_cast<Instruction>(Condition);
// All the rest of our checks depend on the condition being an instruction.
if (!CondInst) {
// FIXME: Unify this with code below.
if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
return true;
return false;
}
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// its value at the branch instruction. We only handle comparisons
// against a constant at this time.
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
if (CondBr && CondConst) {
// We should have returned as soon as we turned a conditional branch into
// an unconditional one, because it is no longer interesting as far as jump
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
ToRemoveSucc->removePredecessor(BB, true);
BranchInst *UncondBr =
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
UncondBr->setDebugLoc(CondBr->getDebugLoc());
CondBr->eraseFromParent();
if (CondCmp->use_empty())
CondCmp->eraseFromParent();
// We can safely replace *some* uses of the CondInst if it has
// exactly one value as returned by LVI. RAUW is incorrect in the
// presence of guards and assumes, that have the `Cond` as the use. This
// is because we use the guards/assume to reason about the `Cond` value
// at the end of block, but RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
else if (CondCmp->getParent() == BB) {
auto *CI = Ret == LazyValueInfo::True ?
ConstantInt::getTrue(CondCmp->getType()) :
ConstantInt::getFalse(CondCmp->getType());
ReplaceFoldableUses(CondCmp, CI);
}
DTU->applyUpdatesPermissive(
{{DominatorTree::Delete, BB, ToRemoveSucc}});
return true;
}
// We did not manage to simplify this branch, try to see whether
// CondCmp depends on a known phi-select pattern.
if (TryToUnfoldSelect(CondCmp, BB))
return true;
}
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
if (TryToUnfoldSelect(SI, BB))
return true;
// Check for some cases that are worth simplifying. Right now we want to look
// for loads that are used by a switch or by the condition for the branch. If
// we see one, check to see if it's partially redundant. If so, insert a PHI
// which can then be used to thread the values.
Value *SimplifyValue = CondInst;
if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
if (isa<Constant>(CondCmp->getOperand(1)))
SimplifyValue = CondCmp->getOperand(0);
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
if (SimplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
updatePredecessorProfileMetadata(PN, BB);
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
return true;
// If this is an otherwise-unfoldable branch on a phi node in the current
// block, see if we can simplify.
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
// Search for a stronger dominating condition that can be used to simplify a
// conditional branch leaving BB.
if (ProcessImpliedCondition(BB))
return true;
return false;
}
bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
Value *Cond = BI->getCondition();
BasicBlock *CurrentBB = BB;
BasicBlock *CurrentPred = BB->getSinglePredecessor();
unsigned Iter = 0;
auto &DL = BB->getModule()->getDataLayout();
while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
if (!PBI || !PBI->isConditional())
return false;
if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
return false;
bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
Optional<bool> Implication =
isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
if (Implication) {
BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
RemoveSucc->removePredecessor(BB);
BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
UncondBI->setDebugLoc(BI->getDebugLoc());
BI->eraseFromParent();
DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
return true;
}
CurrentBB = CurrentPred;
CurrentPred = CurrentBB->getSinglePredecessor();
}
return false;
}
/// Return true if Op is an instruction defined in the given block.
static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->getParent() == BB)
return true;
return false;
}
/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
/// redundant load instruction, eliminate it by replacing it with a PHI node.
/// This is an important optimization that encourages jump threading, and needs
/// to be run interlaced with other jump threading tasks.
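/// A minimal sketch of the pattern (hypothetical IR, for illustration only):
///   pred1:
///     store i32 0, i32* %p
///     br label %bb
///   pred2:
///     br label %bb
///   bb:
///     %v = load i32, i32* %p
/// The load is fully available on the pred1 edge (forwarded from the store)
/// but not on the pred2 edge, so a load is inserted at the end of pred2 and a
/// PHI in %bb merges the two values, which later threading can exploit.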
bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
if (!LoadI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
BasicBlock *LoadBB = LoadI->getParent();
if (LoadBB->getSinglePredecessor())
return false;
// If the load is defined in an EH pad, it can't be partially redundant,
// because the edges between the invoke and the EH pad cannot have other
// instructions between them.
if (LoadBB->isEHPad())
return false;
Value *LoadedPtr = LoadI->getOperand(0);
// If the loaded operand is defined in LoadBB and it's not a phi,
// it can't be available in predecessors.
if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
if (Value *AvailableVal = FindAvailableLoadedValue(
LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
if (IsLoadCSE) {
LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
combineMetadataForCSE(NLoadI, LoadI, false);
}
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
if (AvailableVal == LoadI)
AvailableVal = UndefValue::get(LoadI->getType());
if (AvailableVal->getType() != LoadI->getType())
AvailableVal = CastInst::CreateBitOrPointerCast(
AvailableVal, LoadI->getType(), "", LoadI);
LoadI->replaceAllUsesWith(AvailableVal);
LoadI->eraseFromParent();
return true;
}
// Otherwise, if we scanned the whole block and got to the top of the block,
// we know the block is locally transparent to the load. If not, something
// might clobber its value.
if (BBIt != LoadBB->begin())
return false;
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
AAMDNodes AATags;
LoadI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
AvailablePredsTy AvailablePreds;
BasicBlock *OneUnavailablePred = nullptr;
SmallVector<LoadInst*, 8> CSELoads;
// If we got here, the loaded value is transparent through to the start of the
// block. Check to see if it is available in any of the predecessor blocks.
for (BasicBlock *PredBB : predecessors(LoadBB)) {
// If we already scanned this predecessor, skip it.
if (!PredsScanned.insert(PredBB).second)
continue;
BBIt = PredBB->end();
unsigned NumScanedInst = 0;
Value *PredAvailable = nullptr;
// NOTE: We don't CSE loads that are volatile or anything stronger than
// unordered; that should have been checked when we entered the function.
assert(LoadI->isUnordered() &&
"Attempting to CSE volatile or atomic loads");
// If this is a load on a phi pointer, phi-translate it and search
// for available load/store to the pointer in predecessors.
Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
PredAvailable = FindAvailablePtrLoadStore(
Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
// single predecessor.
BasicBlock *SinglePredBB = PredBB;
while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
NumScanedInst < DefMaxInstsToScan) {
SinglePredBB = SinglePredBB->getSinglePredecessor();
if (SinglePredBB) {
BBIt = SinglePredBB->end();
PredAvailable = FindAvailablePtrLoadStore(
Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
(DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
&NumScanedInst);
}
}
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
}
if (IsLoadCSE)
CSELoads.push_back(cast<LoadInst>(PredAvailable));
// If so, this load is partially redundant. Remember this info so that we
// can create a PHI node.
AvailablePreds.emplace_back(PredBB, PredAvailable);
}
// If the loaded value isn't available in any predecessor, it isn't partially
// redundant.
if (AvailablePreds.empty()) return false;
// Okay, the loaded value is available in at least one (and maybe all!)
// predecessors. If the value is unavailable in more than one unique
// predecessor, we want to insert a merge block for those common predecessors.
// This ensures that we only have to insert one reload, thus not increasing
// code size.
BasicBlock *UnavailablePred = nullptr;
// If the value is unavailable in one of the predecessors, we will end up
// inserting a new instruction into them. It is only valid if all the
// instructions before LoadI are guaranteed to pass execution to its
// successor, or if LoadI is safe to speculate.
// TODO: If this logic becomes more complex and we perform PRE insertion
// farther than a single predecessor, we need to reuse the code from GVN's PRE.
// It requires dominator tree analysis, so for this simple case it is
// overkill.
if (PredsScanned.size() != AvailablePreds.size() &&
!isSafeToSpeculativelyExecute(LoadI))
for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
return false;
// If there is exactly one predecessor where the value is unavailable, the
// already computed 'OneUnavailablePred' block is it. If it ends in an
// unconditional branch, we know that it isn't a critical edge.
if (PredsScanned.size() == AvailablePreds.size()+1 &&
OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
UnavailablePred = OneUnavailablePred;
} else if (PredsScanned.size() != AvailablePreds.size()) {
// Otherwise, we had multiple unavailable predecessors, or the one unavailable
// predecessor reaches LoadBB via a critical edge.
SmallVector<BasicBlock*, 8> PredsToSplit;
SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
for (const auto &AvailablePred : AvailablePreds)
AvailablePredSet.insert(AvailablePred.first);
// Add all the unavailable predecessors to the PredsToSplit list.
for (BasicBlock *P : predecessors(LoadBB)) {
// If the predecessor is an indirect goto, we can't split the edge.
// Same for CallBr.
if (isa<IndirectBrInst>(P->getTerminator()) ||
isa<CallBrInst>(P->getTerminator()))
return false;
if (!AvailablePredSet.count(P))
PredsToSplit.push_back(P);
}
// Split them out to their own block.
UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
}
// If the value isn't available in all predecessors, then there will be
// exactly one where it isn't available. Insert a load on that edge and add
// it to the AvailablePreds list.
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
LoadInst *NewVal = new LoadInst(
LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
LoadI->getName() + ".pr", false, LoadI->getAlign(),
LoadI->getOrdering(), LoadI->getSyncScopeID(),
UnavailablePred->getTerminator());
NewVal->setDebugLoc(LoadI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
AvailablePreds.emplace_back(UnavailablePred, NewVal);
}
// Now we know that each predecessor of this block has a value in
// AvailablePreds, sort them for efficient access as we're walking the preds.
array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
&LoadBB->front());
PN->takeName(LoadI);
PN->setDebugLoc(LoadI->getDebugLoc());
// Insert new entries into the PHI for each predecessor. A single block may
// have multiple entries here.
for (pred_iterator PI = PB; PI != PE; ++PI) {
BasicBlock *P = *PI;
AvailablePredsTy::iterator I =
llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
assert(I != AvailablePreds.end() && I->first == P &&
"Didn't find entry for predecessor!");
// If we have an available predecessor but it requires casting, insert the
// cast in the predecessor and use the cast. Note that we have to update the
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
if (PredV->getType() != LoadI->getType())
PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
P->getTerminator());
PN->addIncoming(PredV, I->first);
}
for (LoadInst *PredLoadI : CSELoads) {
combineMetadataForCSE(PredLoadI, LoadI, true);
}
LoadI->replaceAllUsesWith(PN);
LoadI->eraseFromParent();
return true;
}
/// FindMostPopularDest - The specified list contains multiple possible
/// threadable destinations. Pick the one that occurs the most frequently in
/// the list.
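/// For illustration only: if PredToDestList is {P1->D1, P2->D2, P3->D1},
/// D1 is returned (count 2 vs. 1); entries whose destination is null (undef)
/// are not counted, so a list containing only null destinations yields
/// nullptr.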
static BasicBlock *
FindMostPopularDest(BasicBlock *BB,
const SmallVectorImpl<std::pair<BasicBlock *,
BasicBlock *>> &PredToDestList) {
assert(!PredToDestList.empty());
// Determine popularity. If there are multiple possible destinations, we
// explicitly choose to ignore 'undef' destinations. We prefer to thread
// blocks with known and real destinations to threading undef. We'll handle
// them later if interesting.
MapVector<BasicBlock *, unsigned> DestPopularity;
// Populate DestPopularity with the successors in the order they appear in the
// successor list. This way, we ensure determinism by iterating it in the
// same order in std::max_element below. We map nullptr to 0 so that we can
// return nullptr when PredToDestList contains nullptr only.
DestPopularity[nullptr] = 0;
for (auto *SuccBB : successors(BB))
DestPopularity[SuccBB] = 0;
for (const auto &PredToDest : PredToDestList)
if (PredToDest.second)
DestPopularity[PredToDest.second]++;
// Find the most popular dest.
using VT = decltype(DestPopularity)::value_type;
auto MostPopular = std::max_element(
DestPopularity.begin(), DestPopularity.end(),
[](const VT &L, const VT &R) { return L.second < R.second; });
// Okay, we have finally picked the most popular destination.
return MostPopular->first;
}
// Try to evaluate the value of V when the control flows from PredPredBB to
// BB->getSinglePredecessor() and then on to BB.
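// For illustration only (hypothetical IR): with
//   PredBB:
//     %phi = phi i32 [ 0, %PredPredBB ], [ %n, %Other ]
//     br i1 %t, label %BB, label %Else
//   BB:
//     %cmp = icmp eq i32 %phi, 0
// evaluating %cmp on the PredPredBB->PredBB edge substitutes the incoming
// value 0 for %phi and folds the compare to true.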
Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
BasicBlock *PredPredBB,
Value *V) {
BasicBlock *PredBB = BB->getSinglePredecessor();
assert(PredBB && "Expected a single predecessor");
if (Constant *Cst = dyn_cast<Constant>(V)) {
return Cst;
}
// Consult LVI if V is not an instruction in BB or PredBB.
Instruction *I = dyn_cast<Instruction>(V);
if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
}
// Look into a PHI argument.
if (PHINode *PHI = dyn_cast<PHINode>(V)) {
if (PHI->getParent() == PredBB)
return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
return nullptr;
}
// If we have a CmpInst, try to fold it for each incoming edge into PredBB.
if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
if (CondCmp->getParent() == BB) {
Constant *Op0 =
EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
Constant *Op1 =
EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
if (Op0 && Op1) {
return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
}
}
return nullptr;
}
return nullptr;
}
bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
ConstantPreference Preference,
Instruction *CxtI) {
// If threading this would thread across a loop header, don't even try to
// thread the edge.
if (LoopHeaders.count(BB))
return false;
PredValueInfoTy PredValues;
if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
CxtI)) {
// We don't have known values in predecessors. See if we can thread through
// BB and its sole predecessor.
return MaybeThreadThroughTwoBasicBlocks(BB, Cond);
}
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
for (const auto &PredValue : PredValues) {
dbgs() << " BB '" << BB->getName()
<< "': FOUND condition = " << *PredValue.first
<< " for pred '" << PredValue.second->getName() << "'.\n";
});
// Decide what we want to thread through. Convert our list of known values to
// a list of known destinations for each pred. This also discards duplicate
// predecessors and keeps track of the undefined inputs (which are represented
// as a null dest in the PredToDestList).
SmallPtrSet<BasicBlock*, 16> SeenPreds;
SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
BasicBlock *OnlyDest = nullptr;
BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
Constant *OnlyVal = nullptr;
Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
for (const auto &PredValue : PredValues) {
BasicBlock *Pred = PredValue.second;
if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
Constant *Val = PredValue.first;
BasicBlock *DestBB;
if (isa<UndefValue>(Val))
DestBB = nullptr;
else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
assert(isa<IndirectBrInst>(BB->getTerminator())
&& "Unexpected terminator");
assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
DestBB = cast<BlockAddress>(Val)->getBasicBlock();
}
// If we have exactly one destination, remember it for efficiency below.
if (PredToDestList.empty()) {
OnlyDest = DestBB;
OnlyVal = Val;
} else {
if (OnlyDest != DestBB)
OnlyDest = MultipleDestSentinel;
// It's possible we have the same destination but a different value, e.g. the
// default case in a switch.
if (Val != OnlyVal)
OnlyVal = MultipleVal;
}
// If the predecessor ends with an indirect goto, we can't change its
// destination. Same for CallBr.
if (isa<IndirectBrInst>(Pred->getTerminator()) ||
isa<CallBrInst>(Pred->getTerminator()))
continue;
PredToDestList.emplace_back(Pred, DestBB);
}
// If all edges were unthreadable, we fail.
if (PredToDestList.empty())
return false;
// If all the predecessors go to a single known successor, we want to fold,
// not thread. By doing so, we do not need to duplicate the current block, and
// we avoid missing potential opportunities in case we don't/can't duplicate.
if (OnlyDest && OnlyDest != MultipleDestSentinel) {
if (BB->hasNPredecessors(PredToDestList.size())) {
bool SeenFirstBranchToOnlyDest = false;
std::vector <DominatorTree::UpdateType> Updates;
Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
for (BasicBlock *SuccBB : successors(BB)) {
if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
} else {
SuccBB->removePredecessor(BB, true); // This is unreachable successor.
Updates.push_back({DominatorTree::Delete, BB, SuccBB});
}
}
// Finally update the terminator.
Instruction *Term = BB->getTerminator();
BranchInst::Create(OnlyDest, Term);
Term->eraseFromParent();
DTU->applyUpdatesPermissive(Updates);
// If the condition is now dead due to the removal of the old terminator,
// erase it.
if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
CondInst->eraseFromParent();
// We can safely replace *some* uses of the CondInst if it has
// exactly one value as returned by LVI. RAUW is incorrect in the
// presence of guards and assumes, that have the `Cond` as the use. This
// is because we use the guards/assume to reason about the `Cond` value
// at the end of block, but RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
else if (OnlyVal && OnlyVal != MultipleVal &&
CondInst->getParent() == BB)
ReplaceFoldableUses(CondInst, OnlyVal);
}
return true;
}
}
// Determine which is the most common successor. If we have many inputs and
// this block is a switch, we want to start by threading the batch that goes
// to the most popular destination first. If we only know about one
// threadable destination (the common case) we can avoid this.
BasicBlock *MostPopularDest = OnlyDest;
if (MostPopularDest == MultipleDestSentinel) {
// Remove any loop headers from the Dest list; ThreadEdge conservatively
// won't process them, but we might have other destinations that are eligible
// and that we still want to process.
erase_if(PredToDestList,
[&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
return LoopHeaders.count(PredToDest.second) != 0;
});
if (PredToDestList.empty())
return false;
MostPopularDest = FindMostPopularDest(BB, PredToDestList);
}
// Now that we know what the most popular destination is, factor all
// predecessors that will jump to it into a single predecessor.
SmallVector<BasicBlock*, 16> PredsToFactor;
for (const auto &PredToDest : PredToDestList)
if (PredToDest.second == MostPopularDest) {
BasicBlock *Pred = PredToDest.first;
// This predecessor may be a switch or something else that has multiple
// edges to the block. Factor each of these edges by listing them
// according to # occurrences in PredsToFactor.
for (BasicBlock *Succ : successors(Pred))
if (Succ == BB)
PredsToFactor.push_back(Pred);
}
// If the threadable edges are branching on an undefined value, we get to pick
// the destination that these predecessors should get to.
if (!MostPopularDest)
MostPopularDest = BB->getTerminator()->
getSuccessor(GetBestDestForJumpOnUndef(BB));
// Ok, try to thread it!
return TryThreadEdge(BB, PredsToFactor, MostPopularDest);
}
/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
/// a PHI node in the current block. See if there are any simplifications we
/// can do based on inputs to the phi node.
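/// For illustration only (hypothetical IR): given
///   pred:
///     br label %bb
///   bb:
///     %p = phi i1 [ true, %pred ], [ %x, %other ]
///     br i1 %p, label %t, label %f
/// duplicating the conditional branch of %bb into %pred makes the copy branch
/// on the constant true, so the pred->bb edge effectively becomes a direct
/// jump to %t.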
bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
BasicBlock *BB = PN->getParent();
// TODO: We could make use of this to do it once for blocks with common PHI
// values.
SmallVector<BasicBlock*, 1> PredBBs;
PredBBs.resize(1);
// If any of the predecessor blocks end in an unconditional branch, we can
// *duplicate* the conditional branch into that block in order to further
// encourage jump threading and to eliminate cases where we have a branch on a
// phi of an icmp (a branch on an icmp is much better).
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
if (PredBr->isUnconditional()) {
PredBBs[0] = PredBB;
// Try to duplicate BB into PredBB.
if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
return true;
}
}
return false;
}
/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
/// a xor instruction in the current block. See if there are any
/// simplifications we can do based on inputs to the xor.
bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
BasicBlock *BB = BO->getParent();
// If either the LHS or RHS of the xor is a constant, don't do this
// optimization.
if (isa<ConstantInt>(BO->getOperand(0)) ||
isa<ConstantInt>(BO->getOperand(1)))
return false;
// If the first instruction in BB isn't a phi, we won't be able to infer
// anything special about any particular predecessor.
if (!isa<PHINode>(BB->front()))
return false;
// If this BB is a landing pad, we won't be able to split the edge into it.
if (BB->isEHPad())
return false;
// If we have a xor as the branch input to this block, and we know that the
// LHS or RHS of the xor in any predecessor is true/false, then we can clone
// the condition into the predecessor and fix that value to true, saving some
// logical ops on that path and encouraging other paths to simplify.
//
// This copies something like this:
//
// BB:
// %X = phi i1 [1], [%X']
// %Y = icmp eq i32 %A, %B
// %Z = xor i1 %X, %Y
// br i1 %Z, ...
//
// Into:
// BB':
// %Y = icmp ne i32 %A, %B
// br i1 %Y, ...
PredValueInfoTy XorOpValues;
bool isLHS = true;
if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
WantInteger, BO)) {
assert(XorOpValues.empty());
if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
WantInteger, BO))
return false;
isLHS = false;
}
assert(!XorOpValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
// Scan the information to see which is most popular: true or false. The
// predecessors can be of the set true, false, or undef.
unsigned NumTrue = 0, NumFalse = 0;
for (const auto &XorOpValue : XorOpValues) {
if (isa<UndefValue>(XorOpValue.first))
// Ignore undefs for the count.
continue;
if (cast<ConstantInt>(XorOpValue.first)->isZero())
++NumFalse;
else
++NumTrue;
}
// Determine which value to split on, true, false, or undef if neither.
ConstantInt *SplitVal = nullptr;
if (NumTrue > NumFalse)
SplitVal = ConstantInt::getTrue(BB->getContext());
else if (NumTrue != 0 || NumFalse != 0)
SplitVal = ConstantInt::getFalse(BB->getContext());
// Collect all of the blocks that this can be folded into so that we can
// factor this once and clone it once.
SmallVector<BasicBlock*, 8> BlocksToFoldInto;
for (const auto &XorOpValue : XorOpValues) {
if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
continue;
BlocksToFoldInto.push_back(XorOpValue.second);
}
// If we inferred a value for all of the predecessors, then duplication won't
// help us. However, we can just replace the LHS or RHS with the constant.
if (BlocksToFoldInto.size() ==
cast<PHINode>(BB->front()).getNumIncomingValues()) {
if (!SplitVal) {
// If all preds provide undef, just nuke the xor, because it is undef too.
BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
BO->eraseFromParent();
} else if (SplitVal->isZero()) {
// If all preds provide 0, replace the xor with the other input.
BO->replaceAllUsesWith(BO->getOperand(isLHS));
BO->eraseFromParent();
} else {
// If all preds provide 1, set the computed value to 1.
BO->setOperand(!isLHS, SplitVal);
}
return true;
}
+ // If any of the predecessors ends with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator());
+ }))
+ return false;
+
// Try to duplicate BB into PredBB.
return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
}
/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
/// NewPred using the entries from OldPred (suitably mapped).
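/// For illustration only: if PHIBB has "%p = phi i32 [ %v, %OldPred ], ..."
/// and ValueMap maps %v to its clone %v1 in NewPred, the entry
/// "[ %v1, %NewPred ]" is appended; values with no mapping are reused as-is.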
static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
BasicBlock *OldPred,
BasicBlock *NewPred,
DenseMap<Instruction*, Value*> &ValueMap) {
for (PHINode &PN : PHIBB->phis()) {
// Ok, we have a PHI node. Figure out what the incoming value was for the
// DestBlock.
Value *IV = PN.getIncomingValueForBlock(OldPred);
// Remap the value if necessary.
if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
if (I != ValueMap.end())
IV = I->second;
}
PN.addIncoming(IV, NewPred);
}
}
/// Merge basic block BB into its sole predecessor if possible.
bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
BasicBlock *SinglePred = BB->getSinglePredecessor();
if (!SinglePred)
return false;
const Instruction *TI = SinglePred->getTerminator();
if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
SinglePred == BB || hasAddressTakenAndUsed(BB))
return false;
// If SinglePred was a loop header, BB becomes one.
if (LoopHeaders.erase(SinglePred))
LoopHeaders.insert(BB);
LVI->eraseBlock(SinglePred);
MergeBasicBlockIntoOnlyPred(BB, DTU);
// Now that BB is merged into SinglePred (i.e. SinglePred code followed by
// BB code within one basic block `BB`), we need to invalidate the LVI
// information associated with BB, because the LVI information need not be
// true for all of BB after the merge. For example,
// Before the merge, LVI info and code is as follows:
// SinglePred: <LVI info1 for %p val>
// %y = use of %p
// call @exit() // need not transfer execution to successor.
// assume(%p) // from this point on %p is true
// br label %BB
// BB: <LVI info2 for %p val, i.e. %p is true>
// %x = use of %p
// br label exit
//
// Note that this LVI info for blocks BB and SinglePred is correct for %p
// (info2 and info1 respectively). After the merge and the deletion of
// LVI info1 for SinglePred, we have the following code:
// BB: <LVI info2 for %p val>
// %y = use of %p
// call @exit()
// assume(%p)
// %x = use of %p <-- LVI info2 is correct from here onwards.
// br label exit
// LVI info2 for BB is incorrect at the beginning of BB.
// Invalidate LVI information for BB if the LVI is not provably true for
// all of BB.
if (!isGuaranteedToTransferExecutionToSuccessor(BB))
LVI->eraseBlock(BB);
return true;
}
/// Update the SSA form. NewBB contains instructions that are copied from BB.
/// ValueMapping maps old values in BB to new ones in NewBB.
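/// For illustration only (hypothetical example): if %v is defined in BB and
/// used in a later block %exit, then after BB is cloned into NewBB the use in
/// %exit must be rewritten, typically to a PHI that merges %v from BB with
/// its clone from NewBB; SSAUpdater inserts such PHIs as needed.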
void JumpThreadingPass::UpdateSSA(
BasicBlock *BB, BasicBlock *NewBB,
DenseMap<Instruction *, Value *> &ValueMapping) {
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI-derived value. This can require arbitrary
// PHI insertion, which the SSAUpdater is prepared to do; clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use *, 16> UsesToRename;
for (Instruction &I : *BB) {
// Scan all uses of this instruction to see if it is used outside of its
// block, and if so, record them in UsesToRename.
for (Use &U : I.uses()) {
Instruction *User = cast<Instruction>(U.getUser());
if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
if (UserPN->getIncomingBlock(U) == BB)
continue;
} else if (User->getParent() == BB)
continue;
UsesToRename.push_back(&U);
}
// If there are no uses outside the block, we're done with this instruction.
if (UsesToRename.empty())
continue;
LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
// with the two values we know.
SSAUpdate.Initialize(I.getType(), I.getName());
SSAUpdate.AddAvailableValue(BB, &I);
SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
LLVM_DEBUG(dbgs() << "\n");
}
}
/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
/// arguments that come from PredBB. Return the map from the variables in the
/// source basic block to the variables in the newly created basic block.
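/// For illustration only: a PHI "%p = phi i32 [ 1, %PredBB ], [ %x, %Other ]"
/// is cloned into NewBB as a single-entry PHI "%p1 = phi i32 [ 1, %PredBB ]",
/// and later non-PHI clones that used %p are remapped to use %p1 instead.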
DenseMap<Instruction *, Value *>
JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI,
BasicBlock::iterator BE, BasicBlock *NewBB,
BasicBlock *PredBB) {
// We are going to have to map operands from the source basic block to the new
// copy of the block 'NewBB'. If there are PHI nodes in the source basic
// block, evaluate them to account for entry from PredBB.
DenseMap<Instruction *, Value *> ValueMapping;
// Clone the phi nodes of the source basic block into NewBB. The resulting
// phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
// might need to rewrite the operand of the cloned phi.
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
ValueMapping[PN] = NewPN;
}
// Clone the non-phi instructions of the source basic block into NewBB,
// keeping track of the mapping and using it to remap operands in the cloned
// instructions.
for (; BI != BE; ++BI) {
Instruction *New = BI->clone();
New->setName(BI->getName());
NewBB->getInstList().push_back(New);
ValueMapping[&*BI] = New;
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
}
return ValueMapping;
}
/// Attempt to thread through two successive basic blocks.
bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
Value *Cond) {
// Consider:
//
// PredBB:
// %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
// %tobool = icmp eq i32 %cond, 0
// br i1 %tobool, label %BB, label ...
//
// BB:
// %cmp = icmp eq i32* %var, null
// br i1 %cmp, label ..., label ...
//
// We don't know the value of %var at BB even if we know which incoming edge
// we take to BB. However, once we duplicate PredBB for each of its incoming
// edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
// PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
// Require that BB end with a Branch for simplicity.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
if (!CondBr)
return false;
// BB must have exactly one predecessor.
BasicBlock *PredBB = BB->getSinglePredecessor();
if (!PredBB)
return false;
// Require that PredBB end with a conditional Branch. If PredBB ends with an
// unconditional branch, we should be merging PredBB and BB instead. For
// simplicity, we don't deal with a switch.
BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!PredBBBranch || PredBBBranch->isUnconditional())
return false;
// If PredBB has exactly one incoming edge, we don't gain anything by copying
// PredBB.
if (PredBB->getSinglePredecessor())
return false;
// Don't thread through PredBB if it contains a successor edge to itself, in
// which case we would infinite loop. Suppose we are threading an edge from
// PredPredBB through PredBB and BB to SuccBB with PredBB containing a
// successor edge to itself. If we allowed jump threading in this case, we
// could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
// PredBB.thread has a successor edge to PredBB, we would immediately come up
// with another jump threading opportunity from PredBB.thread through PredBB
// and BB to SuccBB. This jump threading would repeatedly occur. That is, we
// would keep peeling one iteration from PredBB.
if (llvm::is_contained(successors(PredBB), PredBB))
return false;
// Don't thread across a loop header.
if (LoopHeaders.count(PredBB))
return false;
// Avoid complication with duplicating EH pads.
if (PredBB->isEHPad())
return false;
// Find a predecessor that we can thread. For simplicity, we only consider a
// successor edge out of BB to which we thread exactly one incoming edge into
// PredBB.
unsigned ZeroCount = 0;
unsigned OneCount = 0;
BasicBlock *ZeroPred = nullptr;
BasicBlock *OnePred = nullptr;
for (BasicBlock *P : predecessors(PredBB)) {
if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
EvaluateOnPredecessorEdge(BB, P, Cond))) {
if (CI->isZero()) {
ZeroCount++;
ZeroPred = P;
} else if (CI->isOne()) {
OneCount++;
OnePred = P;
}
}
}
// Disregard complicated cases where we have to thread multiple edges.
BasicBlock *PredPredBB;
if (ZeroCount == 1) {
PredPredBB = ZeroPred;
} else if (OneCount == 1) {
PredPredBB = OnePred;
} else {
return false;
}
BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
<< "' - would thread to self!\n");
return false;
}
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
<< (BBIsHeader ? "loop header BB '" : "block BB '")
<< BB->getName() << "' to dest "
<< (SuccIsHeader ? "loop header BB '" : "block BB '")
<< SuccBB->getName()
<< "' - it might create an irreducible loop!\n";
});
return false;
}
// Compute the cost of duplicating BB and PredBB.
unsigned BBCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
unsigned PredBBCost = getJumpThreadDuplicationCost(
PredBB, PredBB->getTerminator(), BBDupThreshold);
// Give up if costs are too high. We need to check BBCost and PredBBCost
// individually before checking their sum because getJumpThreadDuplicationCost
// returns (unsigned)~0 for those basic blocks that cannot be duplicated.
if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
BBCost + PredBBCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << PredBBCost
<< " for PredBB, " << BBCost << "for BB\n");
return false;
}
// Now we are ready to duplicate PredBB.
ThreadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
return true;
}
void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *SuccBB) {
LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
<< BB->getName() << "'\n");
BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
BasicBlock *NewBB =
BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
PredBB->getParent(), PredBB);
NewBB->moveAfter(PredBB);
// Set the block frequency of NewBB.
if (HasProfileData) {
auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
BPI->getEdgeProbability(PredPredBB, PredBB);
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
}
// We are going to have to map operands from the original PredBB block to the
// new copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate
// them to account for entry from PredPredBB.
DenseMap<Instruction *, Value *> ValueMapping =
CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
// Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
// This eliminates predecessors from PredPredBB, which requires us to simplify
// any PHI nodes in PredBB.
Instruction *PredPredTerm = PredPredBB->getTerminator();
for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
if (PredPredTerm->getSuccessor(i) == PredBB) {
PredBB->removePredecessor(PredPredBB, true);
PredPredTerm->setSuccessor(i, NewBB);
}
AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
ValueMapping);
AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
ValueMapping);
DTU->applyUpdatesPermissive(
{{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
{DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
{DominatorTree::Insert, PredPredBB, NewBB},
{DominatorTree::Delete, PredPredBB, PredBB}});
UpdateSSA(PredBB, NewBB, ValueMapping);
// Clean up things like PHI nodes with single operands, dead instructions,
// etc.
SimplifyInstructionsInBlock(NewBB, TLI);
SimplifyInstructionsInBlock(PredBB, TLI);
SmallVector<BasicBlock *, 1> PredsToFactor;
PredsToFactor.push_back(NewBB);
ThreadEdge(BB, PredsToFactor, SuccBB);
}
/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
bool JumpThreadingPass::TryThreadEdge(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
<< "' - would thread to self!\n");
return false;
}
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
<< (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
<< "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
<< SuccBB->getName() << "' - it might create an irreducible loop!\n";
});
return false;
}
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
}
ThreadEdge(BB, PredBBs, SuccBB);
return true;
}
/// ThreadEdge - We have decided that it is safe and profitable to factor the
/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
/// across BB. Transform the IR to reflect this change.
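/// For illustration only (hypothetical CFG): with predecessors P1 and P2 of
/// BB, where coming from P1 the branch in BB is known to go to SuccBB, the
/// pass clones BB into BB.thread, redirects P1 to BB.thread, and ends
/// BB.thread with an unconditional branch to SuccBB; P2 keeps branching to
/// the original BB, whose PHI nodes and SSA uses are then fixed up.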
void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
assert(SuccBB != BB && "Don't create an infinite loop");
assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
"Don't thread across loop headers");
// And finally, do it! Start by factoring the predecessors if needed.
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
<< "' to '" << SuccBB->getName()
<< ", across block:\n " << *BB << "\n");
LVI->threadEdge(PredBB, BB, SuccBB);
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
BB->getName()+".thread",
BB->getParent(), BB);
NewBB->moveAfter(PredBB);
// Set the block frequency of NewBB.
if (HasProfileData) {
auto NewBBFreq =
BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
}
// Copy all the instructions from BB to NewBB except the terminator.
DenseMap<Instruction *, Value *> ValueMapping =
CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
// Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
// PHI nodes for NewBB now.
AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
// Update the terminator of PredBB to jump to NewBB instead of BB. This
// eliminates predecessors from BB, which requires us to simplify any PHI
// nodes in BB.
Instruction *PredTerm = PredBB->getTerminator();
for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
if (PredTerm->getSuccessor(i) == BB) {
BB->removePredecessor(PredBB, true);
PredTerm->setSuccessor(i, NewBB);
}
// Enqueue required DT updates.
DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
{DominatorTree::Insert, PredBB, NewBB},
{DominatorTree::Delete, PredBB, BB}});
UpdateSSA(BB, NewBB, ValueMapping);
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
SimplifyInstructionsInBlock(NewBB, TLI);
// Update the edge weight from BB to SuccBB, which should be less than before.
UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
// Threaded an edge!
++NumThreads;
}
/// Create a new basic block that will be the predecessor of BB and successor of
/// all blocks in Preds. When profile data is available, update the frequency of
/// this new block.
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
SmallVector<BasicBlock *, 2> NewBBs;
// Collect the frequencies of all predecessors of BB, which will be used to
// update the edge weight of the result of splitting predecessors.
DenseMap<BasicBlock *, BlockFrequency> FreqMap;
if (HasProfileData)
for (auto Pred : Preds)
FreqMap.insert(std::make_pair(
Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
// In the case when BB is a LandingPad block we create 2 new predecessors
// instead of just one.
if (BB->isLandingPad()) {
std::string NewName = std::string(Suffix) + ".split-lp";
SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
} else {
NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
}
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve((2 * Preds.size()) + NewBBs.size());
for (auto NewBB : NewBBs) {
BlockFrequency NewBBFreq(0);
Updates.push_back({DominatorTree::Insert, NewBB, BB});
for (auto Pred : predecessors(NewBB)) {
Updates.push_back({DominatorTree::Delete, Pred, BB});
Updates.push_back({DominatorTree::Insert, Pred, NewBB});
if (HasProfileData) // Update frequencies between Pred -> NewBB.
NewBBFreq += FreqMap.lookup(Pred);
}
if (HasProfileData) // Apply the summed frequency to NewBB.
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
}
DTU->applyUpdatesPermissive(Updates);
return NewBBs[0];
}
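// Returns true only when the terminator carries complete "branch_weights"
// profile metadata. For illustration (hypothetical metadata): a two-successor
// branch whose !prof attachment is
//   !{!"branch_weights", i32 70, i32 30}
// has one name operand plus one weight per successor, which is what the
// getNumOperands() == getNumSuccessors() + 1 check below verifies.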
bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
const Instruction *TI = BB->getTerminator();
assert(TI->getNumSuccessors() > 1 && "not a split");
MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
if (!WeightsNode)
return false;
MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
if (MDName->getString() != "branch_weights")
return false;
// Ensure there are weights for all of the successors. Note that the first
// operand to the metadata node is a name, not a weight.
return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
}
/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
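/// For illustration only (hypothetical numbers): if Freq(BB) was 100,
/// Freq(NewBB) is 30 after threading, and P(BB->SuccBB) was 50%, then the
/// BB->SuccBB frequency drops from 50 to 50 - 30 = 20, BB's own frequency
/// drops to 100 - 30 = 70, and the remaining successor frequencies are
/// renormalized into branch probabilities below.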
void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *NewBB,
BasicBlock *SuccBB) {
if (!HasProfileData)
return;
assert(BFI && BPI && "BFI & BPI should have been created here");
// As the edge from PredBB to BB is deleted, we have to update the block
// frequency of BB.
auto BBOrigFreq = BFI->getBlockFreq(BB);
auto NewBBFreq = BFI->getBlockFreq(NewBB);
auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
auto BBNewFreq = BBOrigFreq - NewBBFreq;
BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
// Collect updated outgoing edges' frequencies from BB and use them to update
// edge probabilities.
SmallVector<uint64_t, 4> BBSuccFreq;
for (BasicBlock *Succ : successors(BB)) {
auto SuccFreq = (Succ == SuccBB)
? BB2SuccBBFreq - NewBBFreq
: BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
BBSuccFreq.push_back(SuccFreq.getFrequency());
}
uint64_t MaxBBSuccFreq =
*std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
SmallVector<BranchProbability, 4> BBSuccProbs;
if (MaxBBSuccFreq == 0)
BBSuccProbs.assign(BBSuccFreq.size(),
{1, static_cast<unsigned>(BBSuccFreq.size())});
else {
for (uint64_t Freq : BBSuccFreq)
BBSuccProbs.push_back(
BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
// Normalize edge probabilities so that they sum up to one.
BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
BBSuccProbs.end());
}
// Update edge probabilities in BPI.
BPI->setEdgeProbability(BB, BBSuccProbs);
// Update the profile metadata as well.
//
// Don't do this if the profile of the transformed blocks was statically
// estimated. (This could occur despite the function having an entry
// frequency in completely cold parts of the CFG.)
//
// In this case we don't want to suggest to subsequent passes that the
// calculated weights are fully consistent. Consider this graph:
//
//          check_1
//  50% /   |
//  eq_1    | 50%
//       \  |
//          check_2
//  50% /   |
//  eq_2    | 50%
//       \  |
//          check_3
//  50% /   |
//  eq_3    | 50%
//       \  |
//
// Assuming the blocks check_* all compare the same value against 1, 2 and 3,
// the overall probabilities are inconsistent; the total probability that the
// value is either 1, 2 or 3 is 150%.
//
// As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
// becomes 0%. This is even worse if the edge whose probability becomes 0% is
// the loop exit edge. Then based solely on static estimation we would assume
// the loop was extremely hot.
//
// FIXME: Fix this locally as well so that BPI and BFI stay consistent. We
// shouldn't make edges extremely likely or unlikely based solely on static
// estimation.
if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
SmallVector<uint32_t, 4> Weights;
for (auto Prob : BBSuccProbs)
Weights.push_back(Prob.getNumerator());
auto TI = BB->getTerminator();
TI->setMetadata(
LLVMContext::MD_prof,
MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
}
}
/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
/// If we can duplicate the contents of BB up into PredBB, do so now; this
/// improves the odds that the branch will be on an analyzable instruction like
/// a compare.
bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
assert(!PredBBs.empty() && "Can't handle an empty set");
// If BB is a loop header, then duplicating this block outside the loop would
// cause us to transform this into an irreducible loop; don't do this.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
<< "' into predecessor block '" << PredBBs[0]->getName()
<< "' - it might create an irreducible loop!\n");
return false;
}
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
return false;
}
// And finally, do it! Start by factoring the predecessors if needed.
std::vector<DominatorTree::UpdateType> Updates;
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
Updates.push_back({DominatorTree::Delete, PredBB, BB});
// Okay, we decided to do this! Clone all the instructions in BB onto the end
// of PredBB.
LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
<< "' into end of '" << PredBB->getName()
<< "' to eliminate branch on phi. Cost: "
<< DuplicationCost << " block is:" << *BB << "\n");
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
BasicBlock *OldPredBB = PredBB;
PredBB = SplitEdge(OldPredBB, BB);
Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
Updates.push_back({DominatorTree::Insert, PredBB, BB});
Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
// We are going to have to map operands from the original BB block into the
// PredBB block. Evaluate PHI nodes in BB.
DenseMap<Instruction*, Value*> ValueMapping;
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
// Clone the non-phi instructions of BB into PredBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
for (; BI != BB->end(); ++BI) {
Instruction *New = BI->clone();
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
if (Value *IV = SimplifyInstruction(
New,
{BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
ValueMapping[&*BI] = IV;
if (!New->mayHaveSideEffects()) {
New->deleteValue();
New = nullptr;
}
} else {
ValueMapping[&*BI] = New;
}
if (New) {
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
// Update Dominance from simplified New instruction operands.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
}
}
// Check to see if the targets of the branch had PHI nodes. If so, we need to
// add entries to the PHI nodes for the branch from PredBB now.
BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
ValueMapping);
AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
ValueMapping);
UpdateSSA(BB, PredBB, ValueMapping);
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
// that we nuked.
BB->removePredecessor(PredBB, true);
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
DTU->applyUpdatesPermissive(Updates);
++NumDupes;
return true;
}
// Pred is a predecessor of BB with an unconditional branch to BB. SI is
// a Select instruction in Pred. BB has other predecessors and SI is used in
// a PHI node in BB. SI has no other use.
// A new basic block, NewBB, is created and SI is converted to a compare and a
// conditional branch. SI is then erased from its parent.
void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
SelectInst *SI, PHINode *SIUse,
unsigned Idx) {
// Expand the select.
//
//     Pred --
//      |    v
//      |  NewBB
//      |    |
//      |-----
//      v
//     BB
BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
BB->getParent(), BB);
// Move the unconditional branch to NewBB.
PredTerm->removeFromParent();
NewBB->getInstList().insert(NewBB->end(), PredTerm);
// Create a conditional branch and update PHI nodes.
BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
SIUse->setIncomingValue(Idx, SI->getFalseValue());
SIUse->addIncoming(SI->getTrueValue(), NewBB);
// The select is now dead.
SI->eraseFromParent();
DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
{DominatorTree::Insert, Pred, NewBB}});
// Update any other PHI nodes in BB.
for (BasicBlock::iterator BI = BB->begin();
PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
if (Phi != SIUse)
Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
}
bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
if (!CondPHI || CondPHI->getParent() != BB)
return false;
for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
BasicBlock *Pred = CondPHI->getIncomingBlock(I);
SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));
// The second and third conditions can potentially be relaxed. Currently
// they help to simplify the code and allow us to reuse existing code
// developed for TryToUnfoldSelect(CmpInst *, BasicBlock *).
if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
continue;
BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
if (!PredTerm || !PredTerm->isUnconditional())
continue;
UnfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
return true;
}
return false;
}
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
/// br bb2
///
/// bb2:
/// %p = phi [%a, %bb1] ...
/// %c = icmp %p
/// br i1 %c
///
/// And expand the select into a branch structure if one of its arms allows %c
/// to be folded. This later enables threading from bb1 over bb2.
bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
if (!CondBr || !CondBr->isConditional() || !CondLHS ||
CondLHS->getParent() != BB)
return false;
for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
BasicBlock *Pred = CondLHS->getIncomingBlock(I);
SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
// Check whether one of the incoming values is a select in the corresponding
// predecessor.
if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
continue;
BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
if (!PredTerm || !PredTerm->isUnconditional())
continue;
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold; those
// cases will be threaded in any case.
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
LazyValueInfo::Tristate RHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
CondRHS, Pred, BB, CondCmp);
if ((LHSFolds != LazyValueInfo::Unknown ||
RHSFolds != LazyValueInfo::Unknown) &&
LHSFolds != RHSFolds) {
UnfoldSelectInstr(Pred, BB, SI, CondLHS, I);
return true;
}
}
return false;
}
/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
/// same BB in the form
/// bb:
/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
/// %s = select %p, trueval, falseval
///
/// or
///
/// bb:
/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
/// %c = cmp %p, 0
/// %s = select %c, trueval, falseval
///
/// And expand the select into a branch structure. This later enables
/// jump-threading over bb in this pass.
///
/// Using an approach similar to SimplifyCFG::FoldCondBranchOnPHI(), unfold the
/// select if the associated PHI has at least one constant. If the unfolded
/// select is not jump-threaded, it will be folded again in the later
/// optimizations.
bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// This transform can introduce UB (a conditional branch that depends on a
// poison value) that was not present in the original program. See
// @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll.
// Disable this transform under MemorySanitizer.
// FIXME: either delete it or replace with a valid transform. This issue is
// not limited to MemorySanitizer (but has only been observed as an MSan false
// positive in practice so far).
if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
return false;
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB))
return false;
for (BasicBlock::iterator BI = BB->begin();
PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
// Look for a Phi having at least one constant incoming value.
if (llvm::all_of(PN->incoming_values(),
[](Value *V) { return !isa<ConstantInt>(V); }))
continue;
auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
// Check if SI is in BB and uses V as its condition.
if (SI->getParent() != BB)
return false;
Value *Cond = SI->getCondition();
return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
};
SelectInst *SI = nullptr;
for (Use &U : PN->uses()) {
if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
// Look for an ICmp in BB that compares PN with a constant and is the
// condition of a Select.
if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
SI = SelectI;
break;
}
} else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
// Look for a Select in BB that uses PN as condition.
if (isUnfoldCandidate(SelectI, U.get())) {
SI = SelectI;
break;
}
}
}
if (!SI)
continue;
// Expand the select.
Instruction *Term =
SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
BasicBlock *SplitBB = SI->getParent();
BasicBlock *NewBB = Term->getParent();
PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
NewPN->addIncoming(SI->getFalseValue(), BB);
SI->replaceAllUsesWith(NewPN);
SI->eraseFromParent();
// NewBB and SplitBB are newly created blocks which require insertion.
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
Updates.push_back({DominatorTree::Insert, BB, SplitBB});
Updates.push_back({DominatorTree::Insert, BB, NewBB});
Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
// BB's successors were moved to SplitBB, update DTU accordingly.
for (auto *Succ : successors(SplitBB)) {
Updates.push_back({DominatorTree::Delete, BB, Succ});
Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
}
DTU->applyUpdatesPermissive(Updates);
return true;
}
return false;
}
/// Try to propagate a guard from the current BB into one of its predecessors
/// in case another branch of execution implies that the condition of this
/// guard is always true. Currently we only process the simplest case that
/// looks like:
///
/// Start:
/// %cond = ...
/// br i1 %cond, label %T1, label %F1
/// T1:
/// br label %Merge
/// F1:
/// br label %Merge
/// Merge:
/// %condGuard = ...
/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
///
/// And cond either implies condGuard or !condGuard. In this case all the
/// instructions before the guard can be duplicated in both branches, and the
/// guard is then threaded to one of them.
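///
/// For illustration (value names with .t/.f suffixes are hypothetical), when
/// %cond implies %condGuard the instructions above the guard are duplicated
/// into both branches and the guard only survives on the branch where the
/// implication is not proved:
/// T1:
///   %condGuard.t = ...
///   br label %Merge
/// F1:
///   %condGuard.f = ...
///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard.f )[ "deopt"() ]
///   br label %Merge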
bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
using namespace PatternMatch;
// We only want to deal with two predecessors.
BasicBlock *Pred1, *Pred2;
auto PI = pred_begin(BB), PE = pred_end(BB);
if (PI == PE)
return false;
Pred1 = *PI++;
if (PI == PE)
return false;
Pred2 = *PI++;
if (PI != PE)
return false;
if (Pred1 == Pred2)
return false;
// Try to thread one of the guards of the block.
// TODO: Look up deeper than to immediate predecessor?
auto *Parent = Pred1->getSinglePredecessor();
if (!Parent || Parent != Pred2->getSinglePredecessor())
return false;
if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
for (auto &I : *BB)
if (isGuard(&I) && ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
return true;
return false;
}
/// Try to propagate the guard from BB which is the lower block of a diamond
/// to one of its branches, in case the diamond's condition implies the guard's
/// condition.
bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
BranchInst *BI) {
assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
assert(BI->isConditional() && "Unconditional branch has 2 successors?");
Value *GuardCond = Guard->getArgOperand(0);
Value *BranchCond = BI->getCondition();
BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = BI->getSuccessor(1);
auto &DL = BB->getModule()->getDataLayout();
bool TrueDestIsSafe = false;
bool FalseDestIsSafe = false;
// True dest is safe if BranchCond => GuardCond.
auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
if (Impl && *Impl)
TrueDestIsSafe = true;
else {
// False dest is safe if !BranchCond => GuardCond.
Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
if (Impl && *Impl)
FalseDestIsSafe = true;
}
if (!TrueDestIsSafe && !FalseDestIsSafe)
return false;
BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
if (Cost > BBDupThreshold)
return false;
// Duplicate all instructions before the guard and the guard itself to the
// branch where implication is not proved.
BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
assert(GuardedBlock && "Could not create the guarded block?");
// Duplicate all instructions before the guard in the unguarded branch.
// Since we have successfully duplicated the guarded block and this block
// has fewer instructions, we expect it to succeed.
BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
assert(UnguardedBlock && "Could not create the unguarded block?");
LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
<< GuardedBlock->getName() << "\n");
// Some instructions before the guard may still have uses. For them, we need
// to create Phi nodes merging their copies in both guarded and unguarded
// branches. Those instructions that have no uses can be just removed.
SmallVector<Instruction *, 4> ToRemove;
for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
if (!isa<PHINode>(&*BI))
ToRemove.push_back(&*BI);
Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
assert(InsertionPoint && "Empty block?");
// Substitute with Phis & remove.
for (auto *Inst : reverse(ToRemove)) {
if (!Inst->use_empty()) {
PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
NewPN->insertBefore(InsertionPoint);
Inst->replaceAllUsesWith(NewPN);
}
Inst->eraseFromParent();
}
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5bc35aa4695f..f950d0d4eb2b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1,7636 +1,7645 @@
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
// The Look-ahead heuristic goes through the users of the bundle to calculate
// the users cost in getExternalUsesCost(). To avoid compilation time increase
// we limit the number of users visited to this value.
static cl::opt<unsigned> LookAheadUsersBudget(
"slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
cl::desc("The maximum number of users to visit while visiting the "
"predecessors. This prevents compilation time increase."));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
!Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
return false;
if (BB != I->getParent())
return false;
}
return true;
}
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
// Constant expressions and globals can't be vectorized like normal integer/FP
// constants.
for (Value *i : VL)
if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
return false;
return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
for (unsigned i = 1, e = VL.size(); i < e; ++i)
if (VL[i] != VL[0])
return false;
return true;
}
/// \returns True if \p I is commutative; handles CmpInst as well as Instruction.
static bool isCommutative(Instruction *I) {
if (auto *IC = dyn_cast<CmpInst>(I))
return IC->isCommutative();
return I->isCommutative();
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
if (Idx->getValue().uge(Size))
continue;
unsigned IntIdx = Idx->getValue().getZExtValue();
// We can extract an element from an undef vector.
if (isa<UndefValue>(Vec))
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec)
Vec1 = Vec;
else if (!Vec2 || Vec2 == Vec)
Vec2 = Vec;
else
return None;
if (CommonShuffleMode == Permute)
continue;
// If the extract index is not the same as the operation number, it is a
// permutation.
if (IntIdx != I) {
CommonShuffleMode = Permute;
continue;
}
CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
if (CommonShuffleMode == Select && Vec2)
return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have a permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
}
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
}
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
}
InstructionsState() = delete;
InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
auto *I = dyn_cast<Instruction>(Op);
if (I && S.isOpcodeOrAlt(I))
return Op;
return S.OpValue;
}
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
if (Instruction::isIntDivRem(Opcode))
return false;
return true;
}
/// \returns an analysis of the instructions in \p VL, described in
/// InstructionsState: the opcode with which we suppose the whole list could be
/// vectorized even if its structure is diverse.
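///
/// For illustration (values are hypothetical): for
/// VL = { add %a, %b ; sub %c, %d ; add %e, %f ; sub %g, %h }
/// the returned state has a main opcode of Add and an alternate opcode of Sub,
/// so isAltShuffle() is true.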
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
unsigned BaseIndex = 0) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
isValidForAlternation(Opcode)) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
} else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
assert(isValidForAlternation(Opcode) &&
isValidForAlternation(InstOpcode) &&
"Cast isn't safe for alternation, logic needs to be updated!");
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
}
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
cast<Instruction>(VL[AltIndex]));
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
return false;
return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static Optional<unsigned> getExtractIndex(Instruction *E) {
unsigned Opcode = E->getOpcode();
assert((Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue) &&
"Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
if (!CI)
return None;
return CI->getZExtValue();
}
ExtractValueInst *EI = cast<ExtractValueInst>(E);
if (EI->getNumIndices() != 1)
return None;
return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
/// a possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(UserInst);
return (LI->getPointerOperand() == Scalar);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(UserInst);
return (SI->getPointerOperand() == Scalar);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
if (hasVectorInstrinsicScalarOpd(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
}
default:
return false;
}
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return MemoryLocation::get(LI);
return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isSimple();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isSimple();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
public:
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
using ValueSet = SmallPtrSet<Value *, 16>;
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
else
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced by the
/// generated extractvalue instructions.
Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
/// into account (and updating it, if required) the list of externally used
/// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
NumOpsWantToKeepOrder.clear();
NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
MinBWs.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
return D1.second < D2.second;
});
if (I == NumOpsWantToKeepOrder.end() ||
I->getSecond() <= NumOpsWantToKeepOriginalOrder)
return None;
return makeArrayRef(I->getFirst());
}
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
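/// For illustration (hypothetical IR): a tree ending at 'store i32 %v, i32* %p'
/// yields 32, while a tree whose widest load feeding \p V is an i64 load
/// yields 64.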
unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
}
// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
}
/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
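///
/// Hypothetical sketch (not taken from the pass) of the scalar pattern this
/// guards against breaking up, a bswap-like combine of loaded bytes:
///   %b0 = zext i8 %l0 to i32
///   %b1 = zext i8 %l1 to i32
///   %s1 = shl i32 %b1, 8
///   %or = or i32 %b0, %s1
///   ... ; further shifted/zexted loads or'd in
/// The backend can merge such loads into one wide load, which becomes much
/// harder once the pattern is partially vectorized.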
bool isLoadCombineCandidate() const;
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
/// during buildTree_rec(). We keep track of:
/// (i) the user TreeEntry index, and
/// (ii) the index of the edge.
struct EdgeInfo {
EdgeInfo() = default;
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
: UserTE(UserTE), EdgeIdx(EdgeIdx) {}
/// The user TreeEntry.
TreeEntry *UserTE = nullptr;
/// The operand index of the use.
unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &OS,
const BoUpSLP::EdgeInfo &EI) {
EI.dump(OS);
return OS;
}
/// Debug print.
void dump(raw_ostream &OS) const {
OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
<< " EdgeIdx:" << EdgeIdx << "}";
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
};
/// A helper data structure to hold the operands of a vector of instructions.
/// This supports a fixed vector length for all operand vectors.
class VLOperands {
/// For each operand we need (i) the value, and (ii) the opcode that it
/// would be attached to if the expression was in a left-linearized form.
/// This is required to avoid illegal operand reordering.
/// For example:
/// \verbatim
/// 0 Op1
/// |/
/// Op1 Op2 Linearized + Op2
/// \ / ----------> |/
/// - -
///
/// Op1 - Op2 (0 + Op1) - Op2
/// \endverbatim
///
/// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
///
/// Another way to think of this is to track all the operations across the
/// path from the operand all the way to the root of the tree and to
/// calculate the operation that corresponds to this path. For example, the
/// path from Op2 to the root crosses the RHS of the '-', therefore the
/// corresponding operation is a '-' (which matches the one in the
/// linearized tree, as shown above).
///
/// For lack of a better term, we refer to this operation as Accumulated
/// Path Operation (APO).
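///
/// Worked example (hypothetical lane): for 'A[0] = B[0] - C[0]' the LHS
/// operand B[0] gets APO = false, while the RHS operand C[0] gets APO = true
/// because it is attached to the inverse '-' operation in the linearized form.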
struct OperandData {
OperandData() = default;
OperandData(Value *V, bool APO, bool IsUsed)
: V(V), APO(APO), IsUsed(IsUsed) {}
/// The operand value.
Value *V = nullptr;
/// TreeEntries only allow a single opcode, or an alternate sequence of
/// them (e.g., +, -). Therefore, we can safely use a boolean value for the
/// APO. It is set to 'true' if 'V' is attached to an inverse operation
/// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
/// (e.g., Add/Mul)
bool APO = false;
/// Helper data for the reordering function.
bool IsUsed = false;
};
/// During operand reordering, we are trying to select the operand at lane
/// that matches best with the operand at the neighboring lane. Our
/// selection is based on the type of value we are looking for. For example,
/// if the neighboring lane has a load, we need to look for a load that is
/// accessing a consecutive address. These strategies are summarized in the
/// 'ReorderingMode' enumerator.
enum class ReorderingMode {
Load, ///< Matching loads to consecutive memory addresses
Opcode, ///< Matching instructions based on opcode (same or alternate)
Constant, ///< Matching constants
Splat, ///< Matching the same instruction multiple times (broadcast)
Failed, ///< We failed to create a vectorizable group
};
using OperandDataVec = SmallVector<OperandData, 2>;
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
return OpsVec[OpIdx][Lane];
}
/// \returns the operand data at \p OpIdx and \p Lane. Const version.
const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
return OpsVec[OpIdx][Lane];
}
/// Clears the used flag for all entries.
void clearUsed() {
for (unsigned OpIdx = 0, NumOperands = getNumOperands();
OpIdx != NumOperands; ++OpIdx)
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
++Lane)
OpsVec[OpIdx][Lane].IsUsed = false;
}
/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
// The hard-coded scores listed here are not very important. When computing
// the scores of matching one sub-tree with another, we are basically
// counting the number of values that are matching. So even if all scores
// are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example we may have to
// choose between matching loads vs matching opcodes. This is what these
// scores are helping us with: they provide the order of preference.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
static const int ScoreConsecutiveExtracts = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alt opcodes (e.g., add + sub).
static const int ScoreAltOpcodes = 1;
/// Identical instructions (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching with an undef is preferable to failing.
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
/// User external to the vectorized code.
static const int ExternalUseCost = 1;
/// The user is internal but in a different lane.
static const int UserInDiffLaneCost = ExternalUseCost;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
ScalarEvolution &SE) {
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2)
return isConsecutiveAccess(LI1, LI2, DL, SE)
? VLOperands::ScoreConsecutiveLoads
: VLOperands::ScoreFail;
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
return VLOperands::ScoreConstants;
// Extracts from consecutive indexes of the same vector score better, as
// the extracts could be optimized away.
Value *EV;
ConstantInt *Ex1Idx, *Ex2Idx;
if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
return VLOperands::ScoreConsecutiveExtracts;
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1 == I2)
return VLOperands::ScoreSplat;
InstructionsState S = getSameOpcode({I1, I2});
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
: VLOperands::ScoreSameOpcode;
}
if (isa<UndefValue>(V2))
return VLOperands::ScoreUndef;
return VLOperands::ScoreFail;
}
/// Holds the values and their lane that are taking part in the look-ahead
/// score calculation. This is used in the external uses cost calculation.
SmallDenseMap<Value *, int> InLookAheadValues;
/// \Returns the additional cost due to uses of \p LHS and \p RHS that are
/// either external to the vectorized code, or require shuffling.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
int Cost = 0;
std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
// and RHS as base and Idx as the offset.
int Ln = std::min(LHS.second, RHS.second) + Idx;
assert(Ln >= 0 && "Bad lane calculation");
unsigned UsersBudget = LookAheadUsersBudget;
for (User *U : V->users()) {
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
// The user is in the VectorizableTree. Check if we need to insert.
auto It = llvm::find(UserTE->Scalars, U);
assert(It != UserTE->Scalars.end() && "U is in UserTE");
int UserLn = std::distance(UserTE->Scalars.begin(), It);
assert(UserLn >= 0 && "Bad lane");
if (UserLn != Ln)
Cost += UserInDiffLaneCost;
} else {
// Check if the user is in the look-ahead code.
auto It2 = InLookAheadValues.find(U);
if (It2 != InLookAheadValues.end()) {
// The user is in the look-ahead code. Check the lane.
if (It2->second != Ln)
Cost += UserInDiffLaneCost;
} else {
// The user is neither in SLP tree nor in the look-ahead code.
Cost += ExternalUseCost;
}
}
// Limit the number of visited uses to cap compilation time.
if (--UsersBudget == 0)
break;
}
}
return Cost;
}
/// Go through the operands of \p LHS and \p RHS recursively until \p
/// MaxLevel, and return the cumulative score. For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
/// + + + +
/// G1 G2 G3 G4
/// \endverbatim
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
/// heuristic is based on ideas described in:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS, int CurrLevel,
int MaxLevel) {
Value *V1 = LHS.first;
Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
int ShallowScoreAtThisLevel =
std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
getExternalUsesCost(LHS, RHS));
int Lane1 = LHS.second;
int Lane2 = RHS.second;
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
// or if they are SPLAT,
// or if they are not consecutive, early return the current cost.
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
(isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Keep track of in-tree values for determining the external-use cost.
InLookAheadValues[V1] = Lane1;
InLookAheadValues[V2] = Lane2;
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
// Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
// Try to pair operand OpIdx1 of I1 with the best operand of I2.
int MaxTmpScore = 0;
unsigned MaxOpIdx2 = 0;
bool FoundBest = false;
// If I2 is commutative try all combinations.
unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
unsigned ToIdx = isCommutative(I2)
? I2->getNumOperands()
: std::min(I2->getNumOperands(), OpIdx1 + 1);
assert(FromIdx <= ToIdx && "Bad index");
for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
// Skip operands already paired with OpIdx1.
if (Op2Used.count(OpIdx2))
continue;
// Recursively calculate the cost at each level
int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
{I2->getOperand(OpIdx2), Lane2},
CurrLevel + 1, MaxLevel);
// Look for the best score.
if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
FoundBest = true;
}
}
if (FoundBest) {
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
Op2Used.insert(MaxOpIdx2);
ShallowScoreAtThisLevel += MaxTmpScore;
}
}
return ShallowScoreAtThisLevel;
}
/// \Returns the look-ahead score, which tells us how much the sub-trees
/// rooted at \p LHS and \p RHS match, the more they match the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
int getLookAheadScore(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
InLookAheadValues.clear();
return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
}
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
Optional<unsigned>
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
Value *OpLastLane = getData(OpIdx, LastLane).V;
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
struct BestOpData {
Optional<unsigned> Idx = None;
unsigned Score = 0;
} BestOp;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
OperandData &OpData = getData(Idx, Lane);
Value *Op = OpData.V;
bool OpAPO = OpData.APO;
// Skip already selected operands.
if (OpData.IsUsed)
continue;
// Skip if we are trying to move the operand to a position with a
// different opcode in the linearized tree form. This would break the
// semantics.
if (OpAPO != OpIdxAPO)
continue;
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
unsigned Score =
getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
}
break;
}
case ReorderingMode::Splat:
if (Op == OpLastLane)
BestOp.Idx = Idx;
break;
case ReorderingMode::Failed:
return None;
}
}
if (BestOp.Idx) {
getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
return BestOp.Idx;
}
// If we could not find a good match return None.
return None;
}
/// Helper for reorderOperandVecs. \Returns the lane that we should start
/// reordering from. This is the one which has the least number of operands
/// that can freely move about.
unsigned getBestLaneToStartReordering() const {
unsigned BestLane = 0;
unsigned Min = UINT_MAX;
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
++Lane) {
unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
if (NumFreeOps < Min) {
Min = NumFreeOps;
BestLane = Lane;
}
}
return BestLane;
}
/// \Returns the maximum number of operands that are allowed to be reordered
/// for \p Lane. This is used as a heuristic for selecting the first lane to
/// start operand reordering.
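///
/// Worked example (hypothetical lanes): a 'sub' lane has APOs {false, true},
/// giving max(1, 1) = 1, while an 'add' lane has APOs {false, false}, giving
/// max(0, 2) = 2, so the 'sub' lane is preferred as the starting lane.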
unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
// how many of them we have for each APO, like this: Cnt[APO] = x.
// Since we only have two APOs, namely true and false, we can avoid using
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
if (getData(OpIdx, Lane).APO)
++CntTrue;
unsigned CntFalse = NumOperands - CntTrue;
return std::max(CntTrue, CntFalse);
}
/// Go through the instructions in VL and append their operands.
void appendOperandsOfVL(ArrayRef<Value *> VL) {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
OpsVec[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
// RHS operand. The LHS operand of both add and sub is never attached
// to an inverse operation in the linearized form, therefore its APO
// is false. The RHS is true only if VL[Lane] is an inverse operation.
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely
// tell the inverse operations by checking commutativity.
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
APO, false};
}
}
}
/// \returns the number of operands.
unsigned getNumOperands() const { return OpsVec.size(); }
/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
return getData(OpIdx, Lane).V;
}
/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }
/// Clears the data.
void clear() { OpsVec.clear(); }
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector.
/// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
bool OpAPO = getData(OpIdx, Lane).APO;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
continue;
// This is set to true if we found a candidate for broadcast at Lane.
bool FoundCandidate = false;
for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
OperandData &Data = getData(OpI, Ln);
if (Data.APO != OpAPO || Data.IsUsed)
continue;
if (Data.V == Op) {
FoundCandidate = true;
Data.IsUsed = true;
break;
}
}
if (!FoundCandidate)
return false;
}
return true;
}
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R)
: DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
/// \Returns a value vector with the operands across all lanes for the
/// operand at \p OpIdx.
ValueList getVL(unsigned OpIdx) const {
ValueList OpVL(OpsVec[OpIdx].size());
assert(OpsVec[OpIdx].size() == getNumLanes() &&
"Expected same num of lanes across all operands");
for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
OpVL[Lane] = OpsVec[OpIdx][Lane].V;
return OpVL;
}
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
void reorder() {
unsigned NumOperands = getNumOperands();
unsigned NumLanes = getNumLanes();
// Each operand has its own mode. We are using this mode to help us select
// the instructions for each lane, so that they match best with the ones
// we have selected so far.
SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
// This is a greedy single-pass algorithm. We are going over each lane
// once and deciding on the best order right away with no back-tracking.
// However, in order to increase its effectiveness, we start with the lane
// that has operands that can move the least. For example, given the
// following lanes:
// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
// we will start at Lane 1, since the operands of the subtraction cannot
// be reordered. Then we will visit the rest of the lanes in a circular
// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
// Find the first lane that we will start our search from.
unsigned FirstLane = getBestLaneToStartReordering();
// Initialize the modes.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;
else if (isa<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
}
else if (isa<Constant>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Constant;
else if (isa<Argument>(OpLane0))
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
else
// NOTE: This should be unreachable.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
}
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark all operand data as free to use.
clearUsed();
// We keep the original operand order for the FirstLane, so reorder the
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;
if (Lane < 0 || Lane >= (int)NumLanes)
continue;
int LastLane = Lane - Direction;
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
"Out of bounds");
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
Optional<unsigned> BestIdx =
getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
swap(OpIdx, BestIdx.getValue(), Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
}
}
}
// Skip second pass if the strategy did not fail.
if (!StrategyFailed)
break;
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
switch (RMode) {
case ReorderingMode::Load:
return "Load";
case ReorderingMode::Opcode:
return "Opcode";
case ReorderingMode::Constant:
return "Constant";
case ReorderingMode::Splat:
return "Splat";
case ReorderingMode::Failed:
return "Failed";
}
llvm_unreachable("Unimplemented Reordering Type");
}
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
raw_ostream &OS) {
return OS << getModeStr(RMode);
}
/// Debug print.
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
printMode(RMode, dbgs());
}
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
return printMode(RMode, OS);
}
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
const unsigned Indent = 2;
unsigned Cnt = 0;
for (const OperandDataVec &OpDataVec : OpsVec) {
OS << "Operand " << Cnt++ << "\n";
for (const OperandData &OpData : OpDataVec) {
OS.indent(Indent) << "{";
if (Value *V = OpData.V)
OS << *V;
else
OS << "null";
OS << ", APO:" << OpData.APO << "}\n";
}
OS << "\n";
}
return OS;
}
/// Debug print.
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif
};
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
/// Marks the operands of the given values for later deletion by replacing them
/// with Undefs.
void eraseInstructions(ArrayRef<Value *> AV);
~BoUpSLP();
private:
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
const EdgeInfo &EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows reusing the extract instructions.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
bool isFullyVectorizableTinyTree() const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
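/// For illustration (value names are hypothetical): with Scalars == {A, B}
/// and ReuseShuffleIndices == {0, 0, 1, 1}, the list VL == {A, A, B, B} is
/// considered the same, since every VL[i] equals
/// Scalars[ReuseShuffleIndices[i]].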
bool isSame(ArrayRef<Value *> VL) const {
if (VL.size() == Scalars.size())
return std::equal(VL.begin(), VL.end(), Scalars.begin());
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
[this](Value *V, int Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence?
enum EntryState { Vectorize, NeedToGather };
EntryState State;
/// Does this sequence require some shuffling?
SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
VecTreeTy &Container;
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
SmallVector<EdgeInfo, 1> UserTreeIndices;
/// The index of this treeEntry in VectorizableTree.
int Idx = -1;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
/// reordering of operands during buildTree_rec() and vectorizeTree().
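/// For example, for a bundle of two adds {add X0, Y0; add X1, Y1} (names
/// are illustrative only), Operands[0] == {X0, X1} and
/// Operands[1] == {Y0, Y1}, i.e. one ValueList per operand position across
/// all lanes.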
SmallVector<ValueList, 2> Operands;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
public:
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].size() == 0 && "Already resized?");
Operands[OpIdx].resize(Scalars.size());
for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
Operands[OpIdx][Lane] = OpVL[Lane];
}
/// Set the operands of this bundle in their original order.
void setOperandsInOrder() {
assert(Operands.empty() && "Already initialized?");
auto *I0 = cast<Instruction>(Scalars[0]);
Operands.resize(I0->getNumOperands());
unsigned NumLanes = Scalars.size();
for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
OpIdx != NumOperands; ++OpIdx) {
Operands[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
auto *I = cast<Instruction>(Scalars[Lane]);
assert(I->getNumOperands() == NumOperands &&
"Expected same number of operands");
Operands[OpIdx][Lane] = I->getOperand(OpIdx);
}
}
}
/// \returns the \p OpIdx operand of this TreeEntry.
ValueList &getOperand(unsigned OpIdx) {
assert(OpIdx < Operands.size() && "Off bounds");
return Operands[OpIdx];
}
/// \returns the number of operands.
unsigned getNumOperands() const { return Operands.size(); }
/// \return the single \p OpIdx operand.
Value *getSingleOperand(unsigned OpIdx) const {
assert(OpIdx < Operands.size() && "Off bounds");
assert(!Operands[OpIdx].empty() && "No operand available");
return Operands[OpIdx][0];
}
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const {
return getOpcode() != getAltOpcode();
}
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return (getOpcode() == CheckedOpcode ||
getAltOpcode() == CheckedOpcode);
}
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
/// \p OpValue.
Value *isOneOf(Value *Op) const {
auto *I = dyn_cast<Instruction>(Op);
if (I && isOpcodeOrAlt(I))
return Op;
return MainOp;
}
void setOperations(const InstructionsState &S) {
MainOp = S.MainOp;
AltOp = S.AltOp;
}
Instruction *getMainOp() const {
return MainOp;
}
Instruction *getAltOp() const {
return AltOp;
}
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
}
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
}
/// Update the operations state of this entry if reordering occurred.
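/// When ReorderIndices is non-empty, getSameOpcode() is re-run on Scalars
/// starting from ReorderIndices.front(), and the main/alternate
/// instructions are refreshed from its result.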
bool updateStateIfReorder() {
if (ReorderIndices.empty())
return false;
InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
setOperations(S);
return true;
}
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
dbgs() << Idx << ".\n";
for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
dbgs() << "Operand " << OpI << ":\n";
for (const Value *V : Operands[OpI])
dbgs().indent(2) << *V << "\n";
}
dbgs() << "Scalars: \n";
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "State: ";
switch (State) {
case Vectorize:
dbgs() << "Vectorize\n";
break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
}
dbgs() << "MainOp: ";
if (MainOp)
dbgs() << *MainOp << "\n";
else
dbgs() << "NULL\n";
dbgs() << "AltOp: ";
if (AltOp)
dbgs() << *AltOp << "\n";
else
dbgs() << "NULL\n";
dbgs() << "VectorizedValue: ";
if (VectorizedValue)
dbgs() << *VectorizedValue << "\n";
else
dbgs() << "NULL\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
dbgs() << "Emtpy";
else
for (unsigned ReuseIdx : ReuseShuffleIndices)
dbgs() << ReuseIdx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
for (unsigned ReorderIdx : ReorderIndices)
dbgs() << ReorderIdx << ", ";
dbgs() << "\n";
dbgs() << "UserTreeIndices: ";
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
}
#endif
};
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
bool Vectorized = (bool)Bundle;
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
Last->ReorderIndices = ReorderIndices;
Last->setOperations(S);
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
unsigned Lane = 0;
for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
BundleMember = BundleMember->NextInBundle) {
BundleMember->TE = Last;
BundleMember->Lane = Lane;
++Lane;
}
assert((!Bundle.getValue() || Lane == VL.size()) &&
"Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
}
if (UserTreeIdx.UserTE)
Last->UserTreeIndices.push_back(UserTreeIdx);
return Last;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
TreeEntry::VecTreeTy VectorizableTree;
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dumpVectorizableTree() const {
for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
VectorizableTree[Id]->dump();
dbgs() << "\n";
}
}
#endif
TreeEntry *getTreeEntry(Value *V) {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return I->second;
return nullptr;
}
const TreeEntry *getTreeEntry(Value *V) const {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return I->second;
return nullptr;
}
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
: Scalar(S), User(U), Lane(L) {}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
llvm::User *User;
// Which lane does the scalar belong to.
int Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result.hasValue()) {
return result.getValue();
}
MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
return aliased;
}
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
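// If the instruction was already marked for deletion, keep the
// replace-with-undef flag only if every request so far asked for it.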
It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
DenseMap<Instruction *, bool> DeletedInstructions;
/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, which means that this Internal Scalar will be used later,
/// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData() = default;
void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
TE = nullptr;
Lane = -1;
}
/// Returns true if the dependency information has been calculated.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled dependent instructions/bundles.
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
int incrementUnscheduledDeps(int Incr) {
UnscheduledDeps += Incr;
return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
void clearDependencies() {
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
}
void dump(raw_ostream &os) const {
if (!isSchedulingEntity()) {
os << "/ " << *Inst;
} else if (NextInBundle) {
os << '[' << *Inst;
ScheduleData *SD = NextInBundle;
while (SD) {
os << ';' << *SD->Inst;
SD = SD->NextInBundle;
}
os << ']';
} else {
os << *Inst;
}
}
Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
/// Singly linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle = nullptr;
/// Singly linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority = 0;
/// The number of dependencies. This is the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
/// Opcode of the current instruction in the schedule data.
Value *OpValue = nullptr;
/// The TreeEntry that this instruction corresponds to.
TreeEntry *TE = nullptr;
/// The lane of this node in the TreeEntry.
int Lane = -1;
};
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &os,
const BoUpSLP::ScheduleData &SD) {
SD.dump(os);
return os;
}
#endif
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
}
ScheduleData *getScheduleData(Value *V) {
ScheduleData *SD = ScheduleDataMap[V];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
return nullptr;
}
ScheduleData *getScheduleData(Value *V, Value *Key) {
if (V == Key)
return getScheduleData(V);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end()) {
ScheduleData *SD = I->second[Key];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
}
return nullptr;
}
bool isInSchedulingRegion(ScheduleData *SD) const {
return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
if (BundleMember->Inst != BundleMember->OpValue) {
BundleMember = BundleMember->NextInBundle;
continue;
}
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
};
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
if (TreeEntry *TE = BundleMember->TE) {
int Lane = BundleMember->Lane;
assert(Lane >= 0 && "Lane not set");
// Since the vectorization tree is built recursively, this assertion
// ensures that the tree entry has all operands set before reaching
// this code. A couple of exceptions known at the moment are extracts,
// whose second (immediate) operand is not added. Since immediates do
// not affect scheduler behavior, this is considered okay.
auto *In = TE->getMainOp();
assert(In &&
(isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
OpIdx != NumOperands; ++OpIdx)
if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
DecrUnsched(I);
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
// has taken place, so we directly access its operands.
for (Use &U : BundleMember->Inst->operands())
if (auto *I = dyn_cast<Instruction>(U.get()))
DecrUnsched(I);
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
void doForAllOpcodes(Value *V,
function_ref<void(ScheduleData *SD)> Action) {
if (ScheduleData *SD = getScheduleData(V))
Action(SD);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end())
for (auto &P : I->second)
if (P.second->SchedulingRegionID == SchedulingRegionID)
Action(P.second);
}
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
<< "SLP: initially in ready list: " << *I << "\n");
}
});
}
}
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
/// \returns the scheduling bundle. The returned Optional value is non-None
/// if \p VL is allowed to be scheduled.
Optional<ScheduleData *>
tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
BoUpSLP *SLP);
/// Sets all instructions in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
ExtraScheduleDataMap;
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
};
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
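// Note: ~1U and ~2U serve as sentinel contents for the empty/tombstone
// keys; they are assumed never to occur as genuine reordering indices.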
static OrdersType getEmptyKey() {
OrdersType V;
V.push_back(~1U);
return V;
}
static OrdersType getTombstoneKey() {
OrdersType V;
V.push_back(~2U);
return V;
}
static unsigned getHashValue(const OrdersType &V) {
return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
}
static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
return LHS == RHS;
}
};
/// Contains orders of operations along with the number of bundles that have
/// operations in this order. It stores only those orders that require
/// reordering; if reordering is not required, it is counted using \a
/// NumOpsWantToKeepOriginalOrder.
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
/// Number of bundles that do not require reordering.
unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
OptimizationRemarkEmitter *ORE;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be sign-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<
ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
ContainerTy &VectorizableTree;
ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
ContainerTy &VT)
: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
NodeRef operator*() { return I->UserTE; }
};
static NodeRef getEntryNode(BoUpSLP &R) {
return R.VectorizableTree[0].get();
}
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
class nodes_iterator {
using ItTy = ContainerTy::iterator;
ItTy It;
public:
nodes_iterator(const ItTy &It2) : It(It2) {}
NodeRef operator*() { return It->get(); }
nodes_iterator operator++() {
++It;
return *this;
}
bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
};
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
}
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
}
static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
if (isSplat(Entry->Scalars)) {
OS << "<splat> " << *Entry->Scalars[0];
return Str;
}
for (auto V : Entry->Scalars) {
OS << *V;
if (std::any_of(
R->ExternalUses.begin(), R->ExternalUses.end(),
[&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
OS << " <extract>";
OS << "\n";
}
return Str;
}
static std::string getNodeAttributes(const TreeEntry *Entry,
const BoUpSLP *) {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
return "";
}
};
} // end namespace llvm
BoUpSLP::~BoUpSLP() {
for (const auto &Pair : DeletedInstructions) {
// Replace operands of ignored instructions with Undefs if they were
// marked for deletion.
if (Pair.getSecond()) {
Value *Undef = UndefValue::get(Pair.getFirst()->getType());
Pair.getFirst()->replaceAllUsesWith(Undef);
}
Pair.getFirst()->dropAllReferences();
}
for (const auto &Pair : DeletedInstructions) {
assert(Pair.getFirst()->use_empty() &&
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
assert(!verifyFunction(*F, &dbgs()));
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
for (auto *V : AV) {
if (auto *I = dyn_cast<Instruction>(V))
eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
}
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0, EdgeInfo());
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->State == TreeEntry::NeedToGather)
continue;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
int FoundLane = Lane;
if (!Entry->ReuseShuffleIndices.empty()) {
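// Map the position in the deduplicated Scalars list to the first vector
// lane that reuses this scalar.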
FoundLane =
std::distance(Entry->ReuseShuffleIndices.begin(),
llvm::find(Entry->ReuseShuffleIndices, FoundLane));
}
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
continue;
}
}
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
continue;
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (Value *V : VL) {
if (EphValues.count(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is ephemeral.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) should also stay scalar.
for (Value *V : VL) {
if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
DenseMap<Value *, unsigned> UniquePositions;
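// ReuseShuffleIndicies[i] records which entry of UniqueValues lane i of
// VL maps to, so that duplicates can later be recreated with a shuffle.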
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
size_t NumUniqueScalarValues = UniqueValues.size();
if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (NumUniqueScalarValues <= 1 ||
!llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
VL = UniqueValues;
}
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef.get();
Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
if (!Bundle) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
unsigned ShuffleOrOp = S.isAltShuffle() ?
(unsigned) Instruction::ShuffleVector : S.getOpcode();
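// Alternate-opcode bundles (e.g. interleaved add/sub) are dispatched to
// the ShuffleVector case below; uniform bundles dispatch on their opcode.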
switch (ShuffleOrOp) {
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
Instruction *Term = dyn_cast<Instruction>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
}
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
// Keeps the reordered operands to avoid code duplication.
SmallVector<ValueList, 2> OperandsVec;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
TE->setOperand(i, Operands);
OperandsVec.push_back(Operands);
}
for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0);
return;
}
if (!CurrentOrder.empty()) {
LLVM_DEBUG({
dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order";
for (unsigned Idx : CurrentOrder)
dbgs() << " " << Idx;
dbgs() << "\n";
});
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
auto StoredCurrentOrderAndNum =
NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies,
StoredCurrentOrderAndNum->getFirst());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0);
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
SmallVector<Value *, 4> PointerOps(VL.size());
auto POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
*POIter = L->getPointerOperand();
++POIter;
}
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted loads are consecutive.
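// E.g. for four i32 loads, the bundle is consecutive when the last sorted
// pointer is (4 - 1) * 4 == 12 bytes past the first one.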
if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
++NumOpsWantToKeepOriginalOrder;
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++I->getSecond();
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
return;
}
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering casts with different src types.\n");
return;
}
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
CmpInst *Cmp = cast<CmpInst>(V);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering cmp with different predicate.\n");
return;
}
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
ValueList Left, Right;
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
if (Cmp->getPredicate() != P0)
std::swap(LHS, RHS);
Left.push_back(LHS);
Right.push_back(RHS);
}
}
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
case Instruction::Select:
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (Value *V : VL) {
if (cast<Instruction>(V)->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = VL0->getOperand(0)->getType();
for (Value *V : VL) {
Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
}
// We don't combine GEPs with non-constant indexes.
Type *Ty1 = VL0->getOperand(1)->getType();
for (Value *V : VL) {
auto Op = cast<Instruction>(V)->getOperand(1);
if (!isa<ConstantInt>(Op) ||
(Op->getType() != Ty1 &&
Op->getType()->getScalarSizeInBits() >
DL->getIndexSizeInBits(
V->getType()->getPointerAddressSpace()))) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
TE->setOperandsInOrder();
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
// Make sure all stores in the bundle are simple - we can't vectorize
// atomic or volatile stores.
SmallVector<Value *, 4> PointerOps(VL.size());
ValueList Operands(VL.size());
auto POIter = PointerOps.begin();
auto OIter = Operands.begin();
for (Value *V : VL) {
auto *SI = cast<StoreInst>(V);
if (!SI->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
return;
}
*POIter = SI->getPointerOperand();
*OIter = SI->getValueOperand();
++POIter;
++OIter;
}
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted pointer operands are consecutive.
if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original stores are consecutive and do not require reordering.
++NumOpsWantToKeepOriginalOrder;
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++(I->getSecond());
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
}
return;
}
}
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic or
// library function.
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
*CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *F = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
if (!CI2 || CI2->getCalledFunction() != F ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
(VecFunc &&
VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
<< "\n");
return;
}
// Some intrinsics have scalar arguments, which must be the same across
// the bundle for the calls to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
if (hasVectorInstrinsicScalarOpd(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument " << ScalarArgs[j] << "!=" << A1J
<< "\n");
return;
}
}
}
// Verify that the bundle operands are identical between the two calls.
if (CI->hasOperandBundles() &&
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *V << '\n');
return;
}
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcodes like add-sub,
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
default:
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
isa<VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
if (Ty != *ST->element_begin())
return 0;
N *= ST->getNumElements();
EltTy = *ST->element_begin();
} else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
N *= AT->getNumElements();
EltTy = AT->getElementType();
} else {
auto *VT = cast<VectorType>(EltTy);
N *= VT->getNumElements();
EltTy = VT->getElementType();
}
}
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
CurrentOrder.clear();
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
LoadInst *LI = dyn_cast<LoadInst>(Vec);
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
NElts = cast<VectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
return false;
// Check that all of the indices extract from the correct offset.
bool ShouldKeepOrder = true;
unsigned E = VL.size();
// Assign to all items the initial value E + 1 so we can check if the extract
// instruction index was used already.
// Also, later we can check that all the indices are used and we have a
// consecutive access in the extract instructions, by checking that no
// element of CurrentOrder still has value E + 1.
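// For example (indices are illustrative), extracts with indices
// {1, 0, 2, 3} produce CurrentOrder == {1, 0, 2, 3} and make this function
// return false, signalling a non-identity but still reusable order.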
CurrentOrder.assign(E, E + 1);
unsigned I = 0;
for (; I < E; ++I) {
auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
break;
Optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
break;
const unsigned ExtIdx = *Idx;
if (ExtIdx != I) {
if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
break;
ShouldKeepOrder = false;
CurrentOrder[ExtIdx] = I;
} else {
if (CurrentOrder[I] != E + 1)
break;
CurrentOrder[I] = I;
}
}
if (I < E) {
CurrentOrder.clear();
return false;
}
return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
return I->hasOneUse() ||
std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
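// getVectorCallCosts: return the cost of vectorizing the call CI as
// {intrinsic cost, vector library call cost}. The library cost defaults to
// the intrinsic cost when no vectorized library function is available.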
static std::pair<unsigned, unsigned>
getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
int IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
auto Shape =
VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
int LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
}
return {IntrinsicCost, LibCost};
}
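// getEntryCost: return the extra cost of vectorizing tree entry E, i.e. the
// vector cost minus the cost of the scalar instructions it replaces (plus any
// reuse-shuffle cost). A negative value means this bundle is profitable on
// its own.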
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
int ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
if (E->getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
if (areAllUsersVectorized(cast<Instruction>(V)) &&
!ScalarToTreeEntry.count(V)) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(V)->getIndexOperand());
Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
IO->getZExtValue());
}
}
return ReuseShuffleCost + Cost;
}
}
return ReuseShuffleCost + getGatherCost(VL);
}
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(VL[I])->getIndexOperand());
Idx = IO->getZExtValue();
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
} else {
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
++Idx;
}
}
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(V)->getIndexOperand());
Idx = IO->getZExtValue();
} else {
--Idx;
}
ReuseShuffleCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
int DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
DeadCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, the instruction can be
// considered dead.
// Similarly, if it has only one user, it will be vectorized for sure.
if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
if (E->hasOneUse()) {
Instruction *Ext = E->user_back();
if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
all_of(Ext->users(),
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
DeadCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, i);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
Ext);
continue;
}
}
DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
}
return DeadCost;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
CostKind, VL0);
}
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
Builder.getInt1Ty(),
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
ConstantInt *CInt0 = nullptr;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
Op2VP = TargetTransformInfo::OP_None;
break;
}
if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
!CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
CInt0 = CInt;
continue;
}
if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
Align alignment = cast<LoadInst>(VL0)->getAlign();
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
bool IsReorder = !E->ReorderIndices.empty();
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
Align Alignment = SI->getAlign();
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n");
return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
assert(E->isAltShuffle() &&
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
int ScalarCost = 0;
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
ScalarCost += TTI->getInstructionCost(I, CostKind);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
CostKind);
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
}
}
bool BoUpSLP::isFullyVectorizableTinyTree() const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable.\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 &&
VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) ||
isSplat(VectorizableTree[1]->Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;
return true;
}
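// isLoadCombineCandidateImpl: return true if the expression rooted at Root is
// an or/shl chain over a zero-extended load whose combined width
// (element width * NumElts) is a legal integer type, so the backend is
// expected to fold the whole sequence into a single wide load.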
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
TargetTransformInfo *TTI) {
// Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-constant.
Value *ZextLoad = Root;
while (!isa<ConstantExpr>(ZextLoad) &&
(match(ZextLoad, m_Or(m_Value(), m_Value())) ||
match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
// Check if the input is an extended load of the required or/shift expression.
Value *LoadPtr;
if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
return false;
// Require that the total load bit width is a legal integer type.
// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
Type *SrcTy = LoadPtr->getType()->getPointerElementType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
// Everything matched - assume that we can fold the whole sequence using
// load combining.
LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
<< *(cast<Instruction>(Root)) << "\n");
return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
if (RdxOpcode != Instruction::Or)
return false;
unsigned NumElts = VectorizableTree[0]->Scalars.size();
Value *FirstReduced = VectorizableTree[0]->Scalars[0];
return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
}
bool BoUpSLP::isLoadCombineCandidate() const {
// Peek through a final sequence of stores and check if all operations are
// likely to be load-combined.
unsigned NumElts = VectorizableTree[0]->Scalars.size();
for (Value *Scalar : VectorizableTree[0]->Scalars) {
Value *X;
if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
!isLoadCombineCandidateImpl(X, NumElts, TTI))
return false;
}
return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
return false;
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
if (isFullyVectorizableTinyTree())
return false;
assert(VectorizableTree.empty()
? ExternalUses.empty()
: true && "We shouldn't have any external users");
// Otherwise, we can't vectorize the tree. It is both tiny and not fully
// vectorizable.
return true;
}
int BoUpSLP::getSpillCost() const {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
int Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
for (const auto &TEPtr : VectorizableTree) {
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
if (!PrevInst) {
PrevInst = Inst;
continue;
}
// Update LiveValues.
LiveValues.erase(PrevInst);
for (auto &J : PrevInst->operands()) {
if (isa<Instruction>(&*J) && getTreeEntry(&*J))
LiveValues.insert(cast<Instruction>(&*J));
}
LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
});
// Now find the sequence of instructions between PrevInst and Inst.
unsigned NumCalls = 0;
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
PrevInstIt =
PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
continue;
}
// Debug information does not impact spill cost.
if ((isa<CallInst>(&*PrevInstIt) &&
!isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
&*PrevInstIt != PrevInst)
NumCalls++;
++PrevInstIt;
}
if (NumCalls) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
}
PrevInst = Inst;
}
return Cost;
}
int BoUpSLP::getTreeCost() {
int Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I].get();
// We create duplicate tree entries for gather sequences that have multiple
// uses. However, we should not compute the cost of duplicate sequences.
// For example, if we have a build vector (i.e., insertelement sequence)
// that is used by more than one vector instruction, we only need to
// compute the cost of the insertelement instructions once. The redundant
// instructions will be eliminated by CSE.
//
// We should consider not creating duplicate tree entries for gather
// sequences, and instead add additional edges to the tree representing
// their uses. Since such an approach results in fewer total entries,
// existing heuristics based on tree size may yield different results.
//
if (TE.State == TreeEntry::NeedToGather &&
std::any_of(std::next(VectorizableTree.begin(), I + 1),
VectorizableTree.end(),
[TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr->State == TreeEntry::NeedToGather &&
EntryPtr->isSame(TE.Scalars);
}))
continue;
int C = getEntryCost(&TE);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
<< ".\n");
Cost += C;
}
SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
continue;
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
if (EphValues.count(EU.User))
continue;
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
}
}
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
std::string Str;
{
raw_string_ostream OS(Str);
OS << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
return Cost;
}
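// getGatherCost(Ty, ShuffledIndices): cost of building a vector of type Ty
// from scalars: scalarization (insertelement) overhead for the lanes not in
// ShuffledIndices, plus one permute shuffle if any lanes are repeated.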
int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
DemandedElts.setBit(i);
int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
DenseSet<unsigned> ShuffledElements;
DenseSet<Value *> UniqueElements;
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
if (!UniqueElements.insert(VL[Idx]).second)
ShuffledElements.insert(Idx);
}
return getGatherCost(VecTy, ShuffledElements);
}
// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R) {
if (VL.empty())
return;
VLOperands Ops(VL, DL, SE, R);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);
Right = Ops.getVL(1);
}
void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
[=](Value *V) -> bool {
auto *I = cast<Instruction>(V);
return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
}));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
LastInst = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
// for BB in BlocksSchedules or there's no ScheduleData available for
// VL.back(). This can be the case if buildTree_rec aborts for various
// reasons (e.g., the maximum recursion depth is reached, the maximum region
// size is reached, etc.). ScheduleData is initialized in the scheduling
// "dry-run".
//
// If this happens, we can still find the last instruction by brute force. We
// iterate forwards from Front (inclusive) until we either see all
// instructions in the bundle or reach the end of the block. If Front is the
// last instruction in program order, LastInst will be set to Front, and we
// will visit all the remaining instructions in the block.
//
// One of the reasons we exit early from buildTree_rec is to place an upper
// bound on compile-time. Thus, taking an additional compile-time hit here is
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
}
}
assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
Builder.SetInsertPoint(BB, ++LastInst->getIterator());
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
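// Gather: materialize the scalars in VL as a vector by emitting a chain of
// insertelement instructions. Newly created inserts are recorded in GatherSeq
// and CSEBlocks, and scalars that also live in the tree are added to
// ExternalUses so an extract is generated for them later.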
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
GatherSeq.insert(Insrt);
CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
break;
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
if (!E->ReuseShuffleIndices.empty()) {
FoundLane =
std::distance(E->ReuseShuffleIndices.begin(),
llvm::find(E->ReuseShuffleIndices, FoundLane));
}
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
}
return Vec;
}
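// vectorizeTree(VL): vectorize an operand list. If a tree entry matching VL
// already exists, reuse its vectorized value (undoing or recreating the reuse
// shuffle as needed); otherwise gather the scalars, deduplicating repeated
// values through a reuse shuffle when possible.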
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
if (E->isSame(VL)) {
Value *V = vectorizeTree(E);
if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
// We need to get the vectorized value but without shuffle.
if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
V = SV->getOperand(0);
} else {
// Reshuffle to get only unique values.
SmallVector<int, 4> UniqueIdxs;
SmallSet<int, 4> UsedIdxs;
for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
UniqueIdxs);
}
}
return V;
}
}
}
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second || isa<Constant>(V))
UniqueValues.emplace_back(V);
}
// Do not shuffle single element or if number of unique values is not power
// of 2.
if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
!llvm::isPowerOf2_32(UniqueValues.size()))
ReuseShuffleIndicies.clear();
else
VL = UniqueValues;
}
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
ReuseShuffleIndicies, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
return V;
}
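// inversePermutation: build the shuffle mask that undoes the reordering given
// by Indices, i.e. Mask[Indices[I]] = I. Illustrative example: Indices =
// {2, 0, 1} produces Mask = {1, 2, 0}.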
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
Mask.resize(E);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
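// vectorizeTree(E): emit the vector code for a single tree entry and cache it
// in E->VectorizedValue so diamonds in the tree are only materialized once.
// Gather entries are built with Gather(); vectorizable entries are dispatched
// on their main (or alternate-shuffle) opcode.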
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
Instruction *VL0 = E->getMainOp();
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
setInsertPointAfterBundle(E);
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
E->VectorizedValue = V;
return V;
}
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
}
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
Value *Vec = vectorizeTree(E->getOperand(i));
NewPhi->addIncoming(Vec, IBB);
}
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
return V;
}
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
"reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
if (E->ReorderIndices.empty())
Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy =
PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
"reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = NewV;
return NewV;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
setInsertPointAfterBundle(E);
Value *InVec = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::FCmp:
case Instruction::ICmp: {
setInsertPointAfterBundle(E);
Value *L = vectorizeTree(E->getOperand(0));
Value *R = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Select: {
setInsertPointAfterBundle(E);
Value *Cond = vectorizeTree(E->getOperand(0));
Value *True = vectorizeTree(E->getOperand(1));
Value *False = vectorizeTree(E->getOperand(2));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
Value *V = Builder.CreateSelect(Cond, True, False);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::FNeg: {
setInsertPointAfterBundle(E);
Value *Op = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
Value *V = Builder.CreateUnOp(
static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
setInsertPointAfterBundle(E);
Value *LHS = vectorizeTree(E->getOperand(0));
Value *RHS = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
Value *V = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
RHS);
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
bool IsReorder = E->updateStateIfReorder();
if (IsReorder)
VL0 = E->getMainOp();
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
Value *PO = LI->getPointerOperand();
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Store: {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI = cast<StoreInst>(
IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
if (IsReorder) {
SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(
VecValue, UndefValue::get(VecValue->getType()), Mask,
"reorder_shuffle");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
SI->getAlign());
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
// future.
if (getTreeEntry(ScalarPtr))
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::GetElementPtr: {
setInsertPointAfterBundle(E);
Value *Op0 = vectorizeTree(E->getOperand(0));
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList &VL = E->getOperand(j);
// Need to cast all elements to the same type before vectorization to
// avoid a crash.
Type *VL0Ty = VL0->getOperand(j)->getType();
Type *Ty = llvm::all_of(
VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
? VL0Ty
: DL->getIndexType(cast<GetElementPtrInst>(VL0)
->getPointerOperandType()
->getScalarType());
for (Value *&V : VL) {
auto *CI = cast<ConstantInt>(V);
V = ConstantExpr::getIntegerCast(CI, Ty,
CI->getValue().isSignBitSet());
}
Value *OpVec = vectorizeTree(VL);
OpVecs.push_back(OpVec);
}
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E);
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
VecCallCosts.first <= VecCallCosts.second;
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
Function *CF;
if (!UseIntrinsic) {
VFShape Shape = VFShape::get(
*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
}
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
assert(E->isAltShuffle() &&
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
Value *LHS = nullptr, *RHS = nullptr;
if (Instruction::isBinaryOp(E->getOpcode())) {
setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
} else {
setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
}
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
Value *V0, *V1;
if (Instruction::isBinaryOp(E->getOpcode())) {
V0 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
} else {
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<int, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
if (OpInst->getOpcode() == E->getAltOpcode()) {
Mask[i] = e + i;
AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = i;
OpScalars.push_back(E->Scalars[i]);
}
}
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
default:
llvm_unreachable("unknown inst");
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
ExtraValueToDebugLocsMap ExternallyUsedValues;
return vectorizeTree(ExternallyUsedValues);
}
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
// sign extend the extracted values below.
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0]->VectorizedValue = Trunc;
}
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values.\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
if (!MinBWs.count(ScalarRoot))
return Ex;
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, ScalarType);
return Builder.CreateZExt(Ex, ScalarType);
};
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
if (User && !is_contained(Scalar->users(), User))
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
// If User == nullptr, the Scalar is used as extra arg. Generate
// ExtractElement instruction and update the record for this scalar in
// ExternallyUsedValues.
if (!User) {
assert(ExternallyUsedValues.count(Scalar) &&
"Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
auto &Locs = ExternallyUsedValues[Scalar];
ExternallyUsedValues.insert({Ex, Locs});
ExternallyUsedValues.erase(Scalar);
// Required to update internally referenced instructions.
Scalar->replaceAllUsesWith(Ex);
continue;
}
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
Instruction *IncomingTerminator =
PH->getIncomingBlock(i)->getTerminator();
if (isa<CatchSwitchInst>(IncomingTerminator)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->State == TreeEntry::NeedToGather)
continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value");
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
"Deleting out-of-tree value");
}
}
#endif
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
Builder.ClearInsertionPoint();
InstrElementSize.clear();
return VectorizableTree[0]->VectorizedValue;
}
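// optimizeGatherSequence: hoist loop-invariant insertelement sequences into
// the loop preheader and CSE identical insert/extract instructions across
// blocks, visiting blocks in dominance order.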
void BoUpSLP::optimizeGatherSequence() {
LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequence instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *I : GatherSeq) {
if (isDeleted(I))
continue;
// Check if this block is inside a loop.
Loop *L = LI->getLoopFor(I->getParent());
if (!L)
continue;
// Check if it has a preheader.
BasicBlock *PreHeader = L->getLoopPreheader();
if (!PreHeader)
continue;
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (Op0 && L->contains(Op0))
continue;
if (Op1 && L->contains(Op1))
continue;
// We can hoist this instruction. Move it to the pre-header.
I->moveBefore(PreHeader->getTerminator());
}
// Make a list of all reachable blocks in our CSE queue.
SmallVector<const DomTreeNode *, 8> CSEWorkList;
CSEWorkList.reserve(CSEBlocks.size());
for (BasicBlock *BB : CSEBlocks)
if (DomTreeNode *N = DT->getNode(BB)) {
assert(DT->isReachableFromEntry(N));
CSEWorkList.push_back(N);
}
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
llvm::stable_sort(CSEWorkList,
[this](const DomTreeNode *A, const DomTreeNode *B) {
return DT->properlyDominates(A, B);
});
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
if (isDeleted(In))
continue;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
for (Instruction *v : Visited) {
if (In->isIdenticalTo(v) &&
DT->dominates(v->getParent(), In->getParent())) {
In->replaceAllUsesWith(v);
eraseInstruction(In);
In = nullptr;
break;
}
}
if (In) {
assert(!is_contained(Visited, In));
Visited.push_back(In);
}
}
}
CSEBlocks.clear();
GatherSeq.clear();
}
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
Optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
if (isa<PHINode>(S.OpValue))
return nullptr;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
if (!extendSchedulingRegion(V, S))
return None;
}
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
if (BundleMember->IsScheduled) {
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
"bundle member already part of other bundle");
if (PrevInBundle) {
PrevInBundle->NextInBundle = BundleMember;
} else {
Bundle = BundleMember;
}
BundleMember->UnscheduledDepsInBundle = 0;
Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
PrevInBundle = BundleMember;
}
if (ScheduleEnd != OldScheduleEnd) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [](ScheduleData *SD) {
SD->clearDependencies();
});
}
ReSchedule = true;
}
if (ReSchedule) {
resetSchedule();
initialFillReadyList(ReadyInsts);
}
assert(Bundle && "Failed to find schedule bundle");
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
// means that there are no cyclic dependencies and we can schedule it.
// Note that it's important that we don't "schedule" the bundle yet (see
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
ScheduleData *pickedSD = ReadyInsts.back();
ReadyInsts.pop_back();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
}
}
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
return None;
}
return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
if (isa<PHINode>(OpValue))
return;
ScheduleData *Bundle = getScheduleData(OpValue);
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
"tried to unbundle something which is not a bundle");
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
BundleMember->FirstInBundle = BundleMember;
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
if (BundleMember->UnscheduledDepsInBundle == 0) {
ReadyInsts.insert(BundleMember);
}
BundleMember = Next;
}
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
// Allocate a new ScheduleData for the instruction.
if (ChunkPos >= ChunkSize) {
ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
ChunkPos = 0;
}
return &(ScheduleDataChunks.back()[ChunkPos++]);
}
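// extendSchedulingRegion: make sure the scheduling region of this block
// contains V, walking simultaneously up and down from the current region
// boundaries. Returns false if the region would grow beyond
// ScheduleRegionSizeLimit.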
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
const InstructionsState &S) {
if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
assert(isInSchedulingRegion(ISD) &&
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
SD->init(SchedulingRegionID, S.OpValue);
ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
if (CheckSheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
if (isOneOf(S, I) != I)
CheckSheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
}
++UpIter;
}
if (DownIter != LowerEnd) {
if (&*DownIter == I) {
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
<< "\n");
return true;
}
++DownIter;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
}
return true;
}
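// Creates (or reuses) a ScheduleData entry for every instruction in the
// half-open range [FromI, ToI) and initializes it for the current scheduling
// region. Memory-accessing instructions are threaded onto the NextLoadStore
// list between PrevLoadStore and NextLoadStore.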
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID, I);
if (I->mayReadOrWriteMemory() &&
(!isa<IntrinsicInst>(I) ||
cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
} else {
FirstLoadStoreInRegion = SD;
}
CurrentLoadStore = SD;
}
}
if (NextLoadStore) {
if (CurrentLoadStore)
CurrentLoadStore->NextLoadStore = NextLoadStore;
} else {
LastLoadStoreInRegion = CurrentLoadStore;
}
}
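// (Re)computes the dependencies of the bundle rooted at SD and, via the
// worklist, of any reachable bundle that does not yet have valid
// dependencies. Two kinds of edges are considered: def-use edges to users
// inside the scheduling region and memory dependencies along the
// NextLoadStore chain, the latter bounded by AliasedCheckLimit and
// MaxMemDepDistance.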
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
BoUpSLP *SLP) {
assert(SD->isSchedulingEntity());
SmallVector<ScheduleData *, 10> WorkList;
WorkList.push_back(SD);
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.back();
WorkList.pop_back();
ScheduleData *BundleMember = SD;
while (BundleMember) {
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
<< "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
if (BundleMember->OpValue != BundleMember->Inst) {
ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
}
} else {
for (User *U : BundleMember->Inst->users()) {
if (isa<Instruction>(U)) {
ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
// This keeps the instruction/bundle from ever becoming scheduled,
// which eventually disables vectorization.
BundleMember->Dependencies++;
BundleMember->incrementUnscheduledDeps(1);
}
}
}
// Handle the memory dependencies.
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
Instruction *SrcInst = BundleMember->Inst;
MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
unsigned numAliased = 0;
unsigned DistToSrc = 1;
while (DepDest) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
// 2) MaxMemDepDistance: It's for very large blocks and it aborts
// the whole loop (even if the loop is fast, it's quadratic).
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
(numAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
numAliased++;
DepDest->MemoryDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled) {
BundleMember->incrementUnscheduledDeps(1);
}
if (!DestBundle->hasValidDependencies()) {
WorkList.push_back(DestBundle);
}
}
DepDest = DepDest->NextLoadStore;
// Example, explaining the loop break condition: Let's assume our
// starting instruction is i0 and MaxMemDepDistance = 3.
//
// +--------v--v--v
// i0,i1,i2,i3,i4,i5,i6,i7,i8
// +--------^--^--^
//
// MaxMemDepDistance lets us stop alias-checking at i3 and we add
// dependencies from i0 to i3,i4,.. (even if they are not aliased).
// Previously we already added dependencies from i3 to i6,i7,i8
// (because of MaxMemDepDistance). As we added a dependency from
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
break;
DistToSrc++;
}
}
}
BundleMember = BundleMember->NextInBundle;
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
<< "\n");
}
}
}
void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
assert(isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region");
SD->IsScheduled = false;
SD->resetUnscheduledDeps();
});
}
ReadyInsts.clear();
}
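// Performs the actual list scheduling of the block: every ScheduleData gets a
// SchedulingPriority equal to its original position, the ready list is kept
// sorted by that priority, and each picked bundle is physically moved so that
// the final instruction order matches the computed schedule.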
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
// Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
assert(SD->isPartOfBundle() ==
(getTreeEntry(SD->Inst) != nullptr) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
BS->calculateDependencies(SD, false, this);
NumToSchedule++;
}
});
}
BS->initialFillReadyList(ReadyInsts);
Instruction *LastScheduledInst = BS->ScheduleEnd;
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
ScheduleData *picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
ScheduleData *BundleMember = picked;
while (BundleMember) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
BS->BB->getInstList().remove(pickedInst);
BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
pickedInst);
}
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
}
BS->schedule(picked, ReadyInsts);
NumToSchedule--;
}
assert(NumToSchedule == 0 && "could not schedule all instructions");
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
auto E = InstrElementSize.find(V);
if (E != InstrElementSize.end())
return E->second;
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V)) {
Worklist.push_back(I);
Visited.insert(I);
}
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
auto MaxWidth = 0u;
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
FoundUnknownInst = true;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
else if (isa<LoadInst>(I))
MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (Visited.insert(J).second)
Worklist.push_back(J);
}
// If we don't yet handle the instruction, give up.
else
FoundUnknownInst = true;
}
int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V. Otherwise, return the
// maximum width we found.
if (!MaxWidth || FoundUnknownInst)
Width = DL->getTypeSizeInBits(V->getType());
for (Instruction *I : Visited)
InstrElementSize[I] = Width;
return Width;
}
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
// We can always demote constants.
if (isa<Constant>(V)) {
ToDemote.push_back(V);
return true;
}
// If the value is not an instruction in the expression with only one use, it
// cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
if (!I || !I->hasOneUse() || !Expr.count(I))
return false;
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
Roots.push_back(I->getOperand(0));
break;
case Instruction::ZExt:
case Instruction::SExt:
break;
// We can demote certain binary operations if we can demote both of their
// operands.
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
break;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
break;
}
// We can demote phis if we can demote all their incoming operands. Note that
// we don't need to worry about cycles since we ensure single use above.
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
return false;
break;
}
// Otherwise, conservatively give up.
default:
return false;
}
// Record the value that we can demote.
ToDemote.push_back(V);
return true;
}
void BoUpSLP::computeMinimumValueSizes() {
// If there are no external uses, the expression tree must be rooted by a
// store. We can't demote in-memory values, so there is nothing to do here.
if (ExternalUses.empty())
return;
// We only attempt to truncate integer expressions.
auto &TreeRoot = VectorizableTree[0]->Scalars;
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT)
return;
// If the expression is not rooted by a store, these roots should have
// external uses. We will rely on InstCombine to rewrite the expression in
// the narrower type. However, InstCombine only rewrites single-use values.
// This means that if a tree entry other than a root is used externally, it
// must have multiple uses and InstCombine will not rewrite it. The code
// below ensures that only the roots are used externally.
SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
for (auto &EU : ExternalUses)
if (!Expr.erase(EU.Scalar))
return;
if (!Expr.empty())
return;
// Collect the scalar values of the vectorizable expression. We will use this
// context to determine which values can be demoted. If we see a truncation,
// we mark it as seeding another demotion.
for (auto &EntryPtr : VectorizableTree)
Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
// Ensure the roots of the vectorizable tree don't form a cycle. They must
// have a single external user that is not in the vectorizable tree.
for (auto *Root : TreeRoot)
if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
return;
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
SmallVector<Value *, 4> Roots;
for (auto *Root : TreeRoot)
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
return;
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
auto MaxBitWidth = 8u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
MaxBitWidth = std::max<unsigned>(
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
// than sign-extended. We know that if the leading bits are not demanded, we
// can safely zero-extend. So we initialize IsKnownPositive to True.
bool IsKnownPositive = true;
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
// width. Thus, all their bits are technically demanded even though the
// address computation might be vectorized in a smaller type.
//
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
llvm::all_of(TreeRoot, [](Value *R) {
assert(R->hasOneUse() && "Root should have only one use!");
return isa<GetElementPtrInst>(R->user_back());
})) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
// If we can't prove that the sign bit is zero, we must add one to the
// maximum bit width to account for the unknown sign bit. This preserves
// the existing sign bit so we can safely sign-extend the root back to the
// original type. Otherwise, if we know the sign bit is zero, we will
// zero-extend the root instead.
//
// FIXME: This is somewhat suboptimal, as there will be cases where adding
// one to the maximum bit width will yield a larger-than-necessary
// type. In general, we need to add an extra bit only if we can't
// prove that the upper bit of the original type is equal to the
// upper bit of the proposed smaller type. If these two bits are the
// same (either zero or one) we know that sign-extending from the
// smaller type will result in the same value. Here, since we can't
// yet prove this, we are just making the proposed smaller type
// larger to ensure correctness.
if (!IsKnownPositive)
++MaxBitWidth;
}
// Round MaxBitWidth up to the next power-of-two.
if (!isPowerOf2_64(MaxBitWidth))
MaxBitWidth = NextPowerOf2(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth >= TreeRootIT->getBitWidth())
return;
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
while (!Roots.empty())
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
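// MinBWs records, for each demotable scalar, the narrower bit width and
// whether the demoted value must be sign-extended back to the original type
// (it must whenever the sign bit could not be proven to be zero).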
// Finally, map the values we can demote to the maximum bit width we computed.
for (auto *Scalar : ToDemote)
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
namespace {
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
SLPVectorizerPass Impl;
/// Pass identification, replacement for typeid
static char ID;
explicit SLPVectorizer() : FunctionPass(ID) {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
bool doInitialization(Module &M) override {
return false;
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<InjectTLIMappingsLegacy>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
};
} // end anonymous namespace
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
if (!RunSLPVectorization)
return false;
SE = SE_;
TTI = TTI_;
TLI = TLI_;
AA = AA_;
LI = LI_;
DT = DT_;
AC = AC_;
DB = DB_;
DL = &F.getParent()->getDataLayout();
Stores.clear();
GEPs.clear();
bool Changed = false;
// If the target claims to have no vector registers, don't attempt
// vectorization.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
if (!Stores.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
<< " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
<< " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
}
return Changed;
}
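// Attempts to vectorize one chain of consecutive stores as a single tree. The
// whole chain is used as the vectorization factor, so it must contain a
// power-of-two number of stores, at least max(2, MinVF) of them, and the
// element size itself must be a power of two.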
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
const unsigned MinVF = R.getMinVecRegSize() / Sz;
unsigned VF = Chain.size();
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
return false;
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
R.buildTree(Chain);
Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: Handle orders of size less than number of elements in the vector.
if (Order && Order->size() == Chain.size()) {
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
llvm::transform(*Order, ReorderedOps.begin(),
[Chain](const unsigned Idx) { return Chain[Idx]; });
R.buildTree(ReorderedOps);
}
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
if (R.isLoadCombineCandidate())
return false;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
cast<StoreInst>(Chain[0]))
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
<< " and with tree size "
<< NV("TreeSize", R.getTreeSize()));
R.vectorizeTree();
return true;
}
return false;
}
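// Groups the collected stores into chains of consecutive accesses and feeds
// power-of-two sized slices of each chain to vectorizeStoreChain. The pairing
// step is a bounded quadratic search (MaxStoreLookup limits the number of
// comparisons per store); ConsecutiveChain[K] == Idx records that Stores[Idx]
// immediately follows Stores[K] in memory.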
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
int E = Stores.size();
SmallBitVector Tails(E, false);
SmallVector<int, 16> ConsecutiveChain(E, E + 1);
int MaxIter = MaxStoreLookup.getValue();
int IterCnt;
auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
&ConsecutiveChain](int K, int Idx) {
if (IterCnt >= MaxIter)
return true;
++IterCnt;
if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
return false;
Tails.set(Idx);
ConsecutiveChain[K] = Idx;
return true;
};
// Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
for (int Idx = E - 1; Idx >= 0; --Idx) {
// If a store has multiple consecutive store candidates, search according
// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because pairing with the immediately succeeding or preceding
// candidate usually creates the best chance of finding an SLP
// vectorization opportunity.
const int MaxLookDepth = std::max(E - Idx, Idx + 1);
IterCnt = 0;
for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
break;
}
// For stores that start but don't end a link in the chain:
for (int Cnt = E; Cnt > 0; --Cnt) {
int I = Cnt - 1;
if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
Operands.push_back(Stores[I]);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
// If the element size doesn't evenly divide the vector register size, we are
// done.
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Stores[0]);
if (MaxVecRegSize % EltSize != 0)
continue;
unsigned MaxElts = MaxVecRegSize / EltSize;
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
unsigned StartIdx = 0;
for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
vectorizeStoreChain(Slice, R, Cnt)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
// If we vectorized the initial block, there is no need to try to
// vectorize it again.
if (Cnt == StartIdx)
StartIdx += Size;
Cnt += Size;
continue;
}
++Cnt;
}
// Check if the whole array was vectorized already - exit.
if (StartIdx >= Operands.size())
break;
}
}
return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Initialize the collections. We will make a single pass over the block.
Stores.clear();
GEPs.clear();
// Visit the store and getelementptr instructions in BB and organize them in
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
if (!isValidElementType(SI->getValueOperand()->getType()))
continue;
Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
}
// Ignore getelementptr instructions that have more than one index, a
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
auto Idx = GEP->idx_begin()->get();
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
continue;
if (!isValidElementType(Idx->getType()))
continue;
if (GEP->getType()->isVectorTy())
continue;
GEPs[GEP->getPointerOperand()].push_back(GEP);
}
}
}
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = {A, B};
return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool AllowReorder,
ArrayRef<Value *> InsertUses) {
if (VL.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n");
// Check that all of the parts are instructions of the same type;
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
if (!S.getOpcode())
return false;
Instruction *I0 = cast<Instruction>(S.OpValue);
// Make sure invalid types (including vector type) are rejected before
// determining vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
// NOTE: the following will give the user an internal LLVM type name,
// which may not be useful.
R.getORE()->emit([&]() {
std::string type_str;
llvm::raw_string_ostream rso(type_str);
Ty->print(rso);
return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
<< "Cannot SLP vectorize list: type "
<< rso.str() + " is unsupported by vectorizer";
});
return false;
}
}
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
<< "Cannot SLP vectorize list: vectorization factor "
<< "less than 2 is not supported";
});
return false;
}
bool Changed = false;
bool CandidateFound = false;
int MinCost = SLPCostThreshold;
bool CompensateUseCost =
!InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
return V && isa<InsertElementInst>(V);
});
assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
"Each scalar expected to have an associated InsertElement user.");
unsigned NextInst = 0, MaxInst = VL.size();
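// Try the largest power-of-two VF first and successively halve it down to
// MinVF; for each VF, slide a window of up to VF operations over the
// remaining values in VL.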
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen if the number of parts is the same
// as the provided vectorization factor (i.e. the scalar type is used for
// the vector code during codegen).
auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned OpsWidth = 0;
if (I + VF > MaxInst)
OpsWidth = MaxInst - I;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Ops, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && R.isDeleted(I);
}))
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
R.buildTree(Ops);
Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && Order) {
// TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
CandidateFound = true;
if (CompensateUseCost) {
// TODO: Use TTI's getScalarizationOverhead for the sequence of inserts
// rather than the sum of single inserts, as the latter may overestimate
// the cost. This work should also improve the cost estimation for the
// extracts added for users external to the vectorization tree, i.e. that
// part should switch to the same interface.
// For example, the following case is projected code after SLP:
// %4 = extractelement <4 x i64> %3, i32 0
// %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
// %5 = extractelement <4 x i64> %3, i32 1
// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
// %6 = extractelement <4 x i64> %3, i32 2
// %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
// %7 = extractelement <4 x i64> %3, i32 3
// %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
//
// The extracts here are added by SLP in order to feed the users (the
// inserts) of the original scalars and contribute to "ExtractCost" during
// cost evaluation.
// The inserts in turn form a sequence that builds an aggregate, which is
// detected by the findBuildAggregate routine.
// SLP assumes that such a sequence will be optimized away later (by
// instcombine), so it tries to compensate ExtractCost with the cost of the
// insert sequence.
// The current per-element cost calculation approach is not quite accurate
// and tends to create a bias toward favoring vectorization.
// Switching to the TTI interface might help a bit.
// An alternative solution could be to pattern-match for a no-op or a
// shuffle.
unsigned UserCost = 0;
for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
UserCost += TTI->getVectorInstrCost(
Instruction::InsertElement, IE->getType(), CI->getZExtValue());
}
LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
<< ".\n");
Cost -= UserCost;
}
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
cast<Instruction>(Ops[0]))
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
R.vectorizeTree();
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
Changed = true;
}
}
}
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
<< "List vectorization was possible but not beneficial with cost "
<< ore::NV("Cost", MinCost) << " >= "
<< ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
<< "Cannot SLP vectorize list: vectorization was impossible"
<< " with available vectorization factors";
});
}
return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!I)
return false;
if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
return false;
Value *P = I->getParent();
// Vectorize in current basic block only.
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
// Try to vectorize V.
if (tryToVectorizePair(Op0, Op1, R))
return true;
auto *A = dyn_cast<BinaryOperator>(Op0);
auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
}
// Try to skip A.
if (A && A->hasOneUse()) {
auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
}
return false;
}
/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
/// vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
/// reduction. A pairwise reduction will generate a mask of
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
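/// For example, with VecLen = 8 and NumEltsToRdx = 4 this produces
/// <0,2,4,6,undef,undef,undef,undef> (pairwise, left),
/// <1,3,5,7,undef,undef,undef,undef> (pairwise, right) or
/// <4,5,6,7,undef,undef,undef,undef> (splitting); unused lanes stay undef.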
static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
unsigned NumEltsToRdx,
bool IsPairwise, bool IsLeft) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
SmallVector<int, 32> ShuffleMask(VecLen, -1);
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = 2 * i + !IsLeft;
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = NumEltsToRdx + i;
return ShuffleMask;
}
namespace {
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaves.
/// For example, this tree:
///
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
/// +
/// |
/// phi +=
///
/// Or:
/// ...
/// \ /
/// +
/// |
/// *p =
///
class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
/// Kind of the reduction data.
enum ReductionKind {
RK_None, /// Not a reduction.
RK_Arithmetic, /// Binary reduction data.
RK_Min, /// Minimum reduction data.
RK_UMin, /// Unsigned minimum reduction data.
RK_Max, /// Maximum reduction data.
RK_UMax, /// Unsigned maximum reduction data.
};
/// Contains info about operation, like its opcode, left and right operands.
class OperationData {
/// Opcode of the instruction.
unsigned Opcode = 0;
/// Left operand of the reduction operation.
Value *LHS = nullptr;
/// Right operand of the reduction operation.
Value *RHS = nullptr;
/// Kind of the reduction operation.
ReductionKind Kind = RK_None;
/// True if a floating-point min/max reduction has no NaNs.
bool NoNaN = false;
/// Checks if the reduction operation can be vectorized.
bool isVectorizable() const {
return LHS && RHS &&
// We currently only support add/mul/logical && min/max reductions.
((Kind == RK_Arithmetic &&
(Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
Opcode == Instruction::And || Opcode == Instruction::Or ||
Opcode == Instruction::Xor)) ||
((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
(Kind == RK_Min || Kind == RK_Max)) ||
(Opcode == Instruction::ICmp &&
(Kind == RK_UMin || Kind == RK_UMax)));
}
/// Creates reduction operation with the current opcode.
Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
Value *Cmp = nullptr;
switch (Kind) {
case RK_Arithmetic:
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
Name);
case RK_Min:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
: Builder.CreateFCmpOLT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_Max:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
: Builder.CreateFCmpOGT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMin:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpULT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMax:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpUGT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_None:
break;
}
llvm_unreachable("Unknown reduction operation.");
}
public:
explicit OperationData() = default;
/// Constructor for reduced values. They are identified by opcode only and
/// don't have associated LHS/RHS values.
explicit OperationData(Value *V) {
if (auto *I = dyn_cast<Instruction>(V))
Opcode = I->getOpcode();
}
/// Constructor for reduction operations with opcode and its left and
/// right operands.
OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
bool NoNaN = false)
: Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
assert(Kind != RK_None && "One of the reduction operations is expected.");
}
explicit operator bool() const { return Opcode; }
/// Return true if this operation is any kind of minimum or maximum.
bool isMinMax() const {
switch (Kind) {
case RK_Arithmetic:
return false;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
return true;
case RK_None:
break;
}
llvm_unreachable("Reduction kind is not set");
}
/// Get the index of the first operand.
unsigned getFirstOperandIndex() const {
assert(!!*this && "The opcode is not set.");
// We allow calling this before 'Kind' is set, so handle that specially.
if (Kind == RK_None)
return 0;
return isMinMax() ? 1 : 0;
}
/// Total number of operands in the reduction operation.
unsigned getNumberOfOperands() const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
return isMinMax() ? 3 : 2;
}
/// Checks if the operation has the same parent as \p P.
bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (!IsRedOp)
return I->getParent() == P;
if (isMinMax()) {
// SelectInst must be used twice while the condition op must have a
// single use only.
auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
return I->getParent() == P && Cmp && Cmp->getParent() == P;
}
// Arithmetic reduction operation must be used once only.
return I->getParent() == P;
}
/// Expected number of uses for reduction operations/reduced values.
bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax())
return I->hasNUses(2) &&
(!IsReductionOp ||
cast<SelectInst>(I)->getCondition()->hasOneUse());
return I->hasOneUse();
}
/// Initializes the list of reduction operations.
void initReductionOps(ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax())
ReductionOps.assign(2, ReductionOpsType());
else
ReductionOps.assign(1, ReductionOpsType());
}
/// Add all reduction operations for the reduction instruction \p I.
void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax()) {
ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
ReductionOps[1].emplace_back(I);
} else {
ReductionOps[0].emplace_back(I);
}
}
/// Checks if instruction is associative and can be vectorized.
bool isAssociative(Instruction *I) const {
assert(Kind != RK_None && *this && LHS && RHS &&
"Expected reduction operation.");
switch (Kind) {
case RK_Arithmetic:
return I->isAssociative();
case RK_Min:
case RK_Max:
return Opcode == Instruction::ICmp ||
cast<Instruction>(I->getOperand(0))->isFast();
case RK_UMin:
case RK_UMax:
assert(Opcode == Instruction::ICmp &&
"Only integer compare operation is expected.");
return true;
case RK_None:
break;
}
llvm_unreachable("Reduction kind is not set");
}
/// Checks if the reduction operation can be vectorized.
bool isVectorizable(Instruction *I) const {
return isVectorizable() && isAssociative(I);
}
/// Checks if two operation data are both a reduction op or both a reduced
/// value.
bool operator==(const OperationData &OD) const {
assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
"One of the comparing operations is incorrect.");
return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
}
bool operator!=(const OperationData &OD) const { return !(*this == OD); }
void clear() {
Opcode = 0;
LHS = nullptr;
RHS = nullptr;
Kind = RK_None;
NoNaN = false;
}
/// Get the opcode of the reduction operation.
unsigned getOpcode() const {
assert(isVectorizable() && "Expected vectorizable operation.");
return Opcode;
}
/// Get kind of reduction data.
ReductionKind getKind() const { return Kind; }
Value *getLHS() const { return LHS; }
Value *getRHS() const { return RHS; }
Type *getConditionType() const {
return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr;
}
/// Creates reduction operation with the current opcode with the IR flags
/// from \p ReductionOps.
Value *createOp(IRBuilder<> &Builder, const Twine &Name,
const ReductionOpsListType &ReductionOps) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);
switch (Kind) {
case RK_Arithmetic:
propagateIRFlags(Op, ReductionOps[0]);
return Op;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op))
propagateIRFlags(SI->getCondition(), ReductionOps[0]);
propagateIRFlags(Op, ReductionOps[1]);
return Op;
case RK_None:
break;
}
llvm_unreachable("Unknown reduction operation.");
}
/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.
Value *createOp(IRBuilder<> &Builder, const Twine &Name,
Instruction *I) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);
switch (Kind) {
case RK_Arithmetic:
propagateIRFlags(Op, I);
return Op;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(SI->getCondition(),
cast<SelectInst>(I)->getCondition());
}
propagateIRFlags(Op, I);
return Op;
case RK_None:
break;
}
llvm_unreachable("Unknown reduction operation.");
}
TargetTransformInfo::ReductionFlags getFlags() const {
TargetTransformInfo::ReductionFlags Flags;
Flags.NoNaN = NoNaN;
switch (Kind) {
case RK_Arithmetic:
break;
case RK_Min:
Flags.IsSigned = Opcode == Instruction::ICmp;
Flags.IsMaxOp = false;
break;
case RK_Max:
Flags.IsSigned = Opcode == Instruction::ICmp;
Flags.IsMaxOp = true;
break;
case RK_UMin:
Flags.IsSigned = false;
Flags.IsMaxOp = false;
break;
case RK_UMax:
Flags.IsSigned = false;
Flags.IsMaxOp = true;
break;
case RK_None:
llvm_unreachable("Reduction kind is not set");
}
return Flags;
}
};
WeakTrackingVH ReductionRoot;
/// The operation data of the reduction operation.
OperationData ReductionData;
/// The operation data of the values we perform a reduction on.
OperationData ReducedValueData;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction = false;
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
Value *ExtraArg) {
if (ExtraArgs.count(ParentStackElem.first)) {
ExtraArgs[ParentStackElem.first] = nullptr;
// We ran into something like:
// ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
// The whole ParentStackElem.first should be considered as an extra value
// in this case.
// Do not perform analysis of the remaining operands of the
// ParentStackElem.first instruction; this whole instruction is an extra
// argument.
ParentStackElem.second = ParentStackElem.first->getNumOperands();
} else {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
ExtraArgs[ParentStackElem.first] = ExtraArg;
}
}
static OperationData getOperationData(Value *V) {
if (!V)
return OperationData();
Value *LHS;
Value *RHS;
if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
RK_Arithmetic);
}
if (auto *Select = dyn_cast<SelectInst>(V)) {
// Look for a min/max pattern.
if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
} else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
} else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Min,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
} else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
} else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
} else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
} else {
// Try harder: look for a min/max pattern based on instructions producing
// the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
// During the intermediate stages of SLP, it's very common to have a
// pattern like this (since optimizeGatherSequence is run only once
// at the end):
// %1 = extractelement <2 x i32> %a, i32 0
// %2 = extractelement <2 x i32> %a, i32 1
// %cond = icmp sgt i32 %1, %2
// %3 = extractelement <2 x i32> %a, i32 0
// %4 = extractelement <2 x i32> %a, i32 1
// %select = select i1 %cond, i32 %3, i32 %4
CmpInst::Predicate Pred;
Instruction *L1;
Instruction *L2;
LHS = Select->getTrueValue();
RHS = Select->getFalseValue();
Value *Cond = Select->getCondition();
// TODO: Support inverse predicates.
if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
if (!isa<ExtractElementInst>(RHS) ||
!L2->isIdenticalTo(cast<Instruction>(RHS)))
return OperationData(V);
} else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
if (!isa<ExtractElementInst>(LHS) ||
!L1->isIdenticalTo(cast<Instruction>(LHS)))
return OperationData(V);
} else {
if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
return OperationData(V);
if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
!L2->isIdenticalTo(cast<Instruction>(RHS)))
return OperationData(V);
}
switch (Pred) {
default:
return OperationData(V);
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
case CmpInst::FCMP_OLT:
case CmpInst::FCMP_OLE:
case CmpInst::FCMP_ULT:
case CmpInst::FCMP_ULE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
cast<Instruction>(Cond)->hasNoNaNs());
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
case CmpInst::FCMP_OGT:
case CmpInst::FCMP_OGE:
case CmpInst::FCMP_UGT:
case CmpInst::FCMP_UGE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Cond)->hasNoNaNs());
}
}
}
return OperationData(V);
}
public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
ReductionData = getOperationData(B);
// We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
if (ReductionData.getLHS() == Phi) {
Phi = nullptr;
B = dyn_cast<Instruction>(ReductionData.getRHS());
ReductionData = getOperationData(B);
} else if (ReductionData.getRHS() == Phi) {
Phi = nullptr;
B = dyn_cast<Instruction>(ReductionData.getLHS());
ReductionData = getOperationData(B);
}
}
if (!ReductionData.isVectorizable(B))
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
return false;
ReducedValueData.clear();
ReductionRoot = B;
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
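// Each stack entry is a (node, index of the next operand to visit) pair; the
// explicit stack implements an iterative postorder traversal of the tree.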
Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
ReductionData.initReductionOps(ReductionOps);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
OperationData OpData = getOperationData(TreeN);
bool IsReducedValue = OpData != ReductionData;
// Postorder visit.
if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
auto I = ExtraArgs.find(TreeN);
if (I != ExtraArgs.end() && !I->second) {
// Check if TreeN is an extra argument of its parent operation.
if (Stack.size() <= 1) {
// TreeN can't be an extra argument as it is a root reduction
// operation.
return false;
}
// Yes, TreeN is an extra argument, do not add it to a list of
// reduction operations.
// Stack[Stack.size() - 2] always points to the parent operation.
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
ReductionData.addReductionOps(TreeN, ReductionOps);
}
// Retract.
Stack.pop_back();
continue;
}
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
OpData = getOperationData(I);
// Continue the analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set
// yet, the first operation encountered that is not the reduction
// operation determines the reduced value class.
if (I && (!ReducedValueData || OpData == ReducedValueData ||
OpData == ReductionData)) {
const bool IsReductionOperation = OpData == ReductionData;
// Only handle trees in the current basic block.
if (!ReductionData.hasSameParent(I, B->getParent(),
IsReductionOperation)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
// Each tree node needs to have a minimal number of users, except for the
// ultimate reduction.
if (!ReductionData.hasRequiredNumberOfUses(I,
OpData == ReductionData) &&
I != B) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
if (IsReductionOperation) {
// We need to be able to reassociate the reduction operations.
if (!OpData.isAssociative(I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
} else if (ReducedValueData &&
ReducedValueData != OpData) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
} else if (!ReducedValueData)
ReducedValueData = OpData;
Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
continue;
}
}
// NextV is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), NextV);
}
return true;
}
/// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
return false;
// If there is a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
return false;
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
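// For example, with 13 reduced values ReduxWidth starts at 8: the loop below
// vectorizes the first 8 values, retries with ReduxWidth = 4 for the next 4,
// and the single remaining value is folded in by the scalar finishing loop
// that follows.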
Value *VectorizedTree = nullptr;
// FIXME: Fast-math-flags should be set based on the instructions in the
// reduction (not all of 'fast' are required).
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
FastMathFlags Unsafe;
Unsafe.setFast();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (auto &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
}
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
assert(isa<Instruction>(ScalarCond) &&
"Expected min/max reduction to have compare condition");
return cast<Instruction>(ScalarCond);
};
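// For example, a signed-min reduction step typically has the form
//   %cmp = icmp slt i32 %a, %b
//   %min = select i1 %cmp, i32 %a, i32 %b
// where the select is the reduction root and %cmp is the insertion point
// returned above.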
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
SmallVector<Value *, 16> IgnoreList;
for (auto &V : ReductionOps)
IgnoreList.append(V.begin(), V.end());
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
// TODO: Handle orders of size less than number of elements in the vector.
if (Order && Order->size() == VL.size()) {
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(VL.size());
llvm::transform(*Order, ReorderedOps.begin(),
[VL](const unsigned Idx) { return VL[Idx]; });
V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
break;
V.computeMinimumValueSizes();
// Estimate cost.
int TreeCost = V.getTreeCost();
int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
<< "Vectorizing horizontal reduction is possible"
<< "but not beneficial with cost "
<< ore::NV("Cost", Cost) << " and threshold "
<< ore::NV("Threshold", -SLPCostThreshold);
});
break;
}
LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
<< ore::NV("TreeSize", V.getTreeSize());
});
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction. For min/max, the root is a select, but the insertion
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
if (ReductionData.isMinMax())
Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
else
Builder.SetInsertPoint(RdxRootInst);
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, ReducedSubTree,
ReductionData.getKind());
VectorizedTree =
VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
} else
VectorizedTree = ReducedSubTree;
i += ReduxWidth;
ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, I,
ReductionData.getKind());
VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, Pair.first,
ReductionData.getKind());
VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
}
}
// Update users. For a min/max reduction that ends with a compare and
// select, we also have to RAUW for the compare instruction feeding the
// reduction root. That's because the original compare may have extra uses
// besides the final select of the reduction.
if (ReductionData.isMinMax()) {
if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
Instruction *ScalarCmp =
getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
}
}
ReductionRoot->replaceAllUsesWith(VectorizedTree);
// Mark all scalar reduction ops for deletion; they are replaced by the
// vector reductions.
V.eraseInstructions(IgnoreList);
}
return VectorizedTree != nullptr;
}
unsigned numReductionValues() const {
return ReducedVals.size();
}
private:
/// Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost;
int SplittingRdxCost;
switch (ReductionData.getKind()) {
case RK_Arithmetic:
PairwiseRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
/*IsPairwiseForm=*/true);
SplittingRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
/*IsPairwiseForm=*/false);
break;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax: {
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =
TTI->getMinMaxReductionCost(VecTy, VecCondTy,
/*IsPairwiseForm=*/true, IsUnsigned);
SplittingRdxCost =
TTI->getMinMaxReductionCost(VecTy, VecCondTy,
/*IsPairwiseForm=*/false, IsUnsigned);
break;
}
case RK_None:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost = 0;
switch (ReductionData.getKind()) {
case RK_Arithmetic:
ScalarReduxCost =
TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
break;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
ScalarReduxCost =
TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));
break;
case RK_None:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
ScalarReduxCost *= (ReduxWidth - 1);
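// For example, an 8-wide reduction performed serially needs 7 scalar
// operations, so the scalar estimate is 7x the per-operation cost computed
// above.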
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a "
<< (IsPairwiseReduction ? "pairwise" : "splitting")
<< " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
/// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
if (!IsPairwiseReduction) {
// FIXME: The builder should use an FMF guard. It should not be hard-coded
// to 'fast'.
assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
return createSimpleTargetReduction(
Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
ReductionData.getFlags(), ReductionOps.back());
}
Value *TmpVec = VectorizedValue;
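// The pairwise form below halves the live width each step: for
// ReduxWidth == 8 the loop runs for i = 4, 2, 1, i.e. log2(8) = 3
// shuffle-and-op steps, leaving the final result in lane 0.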
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
RightShuf, ReductionData.getKind());
TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
} // end anonymous namespace
/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
SmallVectorImpl<Value *> &InsertElts) {
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
do {
Value *InsertedOperand;
auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
if (IE) {
InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);
} else {
auto *IV = cast<InsertValueInst>(LastInsertInst);
InsertedOperand = IV->getInsertedValueOperand();
LastInsertInst = IV->getAggregateOperand();
}
if (isa<InsertElementInst>(InsertedOperand) ||
isa<InsertValueInst>(InsertedOperand)) {
SmallVector<Value *, 8> TmpBuildVectorOpds;
SmallVector<Value *, 8> TmpInsertElts;
if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
TmpInsertElts))
return false;
BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
TmpBuildVectorOpds.rend());
InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
} else {
BuildVectorOpds.push_back(InsertedOperand);
InsertElts.push_back(IE);
}
if (isa<UndefValue>(LastInsertInst))
break;
if ((!isa<InsertValueInst>(LastInsertInst) &&
!isa<InsertElementInst>(LastInsertInst)) ||
!LastInsertInst->hasOneUse())
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
std::reverse(InsertElts.begin(), InsertElts.end());
return true;
}
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
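///
/// For example, for a loop-carried reduction PHI such as
/// \code
///   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
/// \endcode
/// the candidate is typically %sum.next, the value coming in from the latch.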
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
return isa<Instruction>(R) &&
DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == ParentBB) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
// Otherwise, check whether we have a loop latch to look at.
Loop *BBL = LI->getLoopFor(ParentBB);
if (!BBL)
return nullptr;
BasicBlock *BBLatch = BBL->getLoopLatch();
if (!BBLatch)
return nullptr;
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
return nullptr;
}
/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
/// \a BB, then check if it can be done. If a horizontal reduction is not found
/// and the root instruction is a binary operation, vectorization of its
/// operands is attempted.
/// \returns true if a horizontal reduction was matched and reduced or the
/// operands of one of the binary instructions were vectorized.
/// \returns false if a horizontal reduction was not matched (or not possible)
/// or no vectorization of any binary operation feeding the \a Root instruction
/// was performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
if (!Root)
return false;
if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
// Start the analysis from the Root instruction. If a horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction, or
// vectorization is not possible or not effective, and the currently analyzed
// instruction is a binary operation, try to vectorize its operands using
// pre-order DFS traversal order. If the operands were not vectorized, repeat
// the same procedure considering each operand as a possible root of the
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
auto *BI = dyn_cast<BinaryOperator>(Inst);
auto *SI = dyn_cast<SelectInst>(Inst);
if (BI || SI) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
Res = true;
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
continue;
}
}
if (P && BI) {
Inst = dyn_cast<Instruction>(BI->getOperand(0));
if (Inst == P)
Inst = dyn_cast<Instruction>(BI->getOperand(1));
if (!Inst) {
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
continue;
}
}
}
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
if (Vectorize(Inst, R)) {
Res = true;
continue;
}
// Try to vectorize operands.
// Continue the analysis only for instructions from the same basic block, to
// save compile time.
if (++Level < RecursionMaxDepth)
for (auto *Op : Inst->operand_values())
if (VisitedInstrs.insert(Op).second)
if (auto *I = dyn_cast<Instruction>(Op))
if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
Stack.emplace_back(I, Level);
}
return Res;
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
if (!V)
return false;
auto *I = dyn_cast<Instruction>(V);
if (!I)
return false;
if (!isa<BinaryOperator>(I))
P = nullptr;
// Try to match and vectorize a horizontal reduction.
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
ExtraVectorization);
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<Value *, 16> BuildVectorInsts;
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
BuildVectorOpds.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// An aggregate value is unlikely to be processed in a vector register, so we
// need to extract the scalars into scalar registers.
return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
BoUpSLP &R) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
return true;
bool OpsChanged = false;
for (int Idx = 0; Idx < 2; ++Idx) {
OpsChanged |=
vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
}
return OpsChanged;
}
bool SLPVectorizerPass::vectorizeSimpleInstructions(
SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
bool OpsChanged = false;
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
else if (auto *CI = dyn_cast<CmpInst>(I))
OpsChanged |= vectorizeCmpInst(CI, BB, R);
}
Instructions.clear();
return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
HaveVectorizedPhiNodes = false;
// Collect the incoming values from the PHIs.
Incoming.clear();
for (Instruction &I : *BB) {
PHINode *P = dyn_cast<PHINode>(&I);
if (!P)
break;
if (!VisitedInstrs.count(P) && !R.isDeleted(P))
Incoming.push_back(P);
}
// Sort by type.
llvm::stable_sort(Incoming, PhiTypeSorterFunc);
// Try to vectorize elements based on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();
IncIt != E;) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
Type *EltTy = (*IncIt)->getType();
- unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
- : MaxVecRegSize;
+
+ assert(EltTy->isSized() &&
+ "Instructions should all be sized at this point");
+ TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
+ if (EltTS.isScalable()) {
+ // For now, just ignore vectorizing scalable types.
+ ++IncIt;
+ continue;
+ }
+
+ unsigned EltSize = EltTS.getFixedSize();
unsigned MaxNumElts = MaxVecRegSize / EltSize;
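// For example, with a 128-bit maximum vector register and 32-bit PHI
// elements, MaxNumElts is 4, so at most four same-typed PHIs are grouped
// per vectorization attempt below.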
if (MaxNumElts < 2) {
++IncIt;
continue;
}
while (SameTypeIt != E &&
(*SameTypeIt)->getType() == EltTy &&
(SameTypeIt - IncIt) < MaxNumElts) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
<< NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter.
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
if (NumElts > 1 &&
tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success: start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
break;
}
// Start over at the next instruction of a different type (or the end).
IncIt = SameTypeIt;
}
}
VisitedInstrs.clear();
SmallVector<Instruction *, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions marked for deletion.
if (R.isDeleted(&*it))
continue;
// We may go through BB multiple times, so skip the ones we have already checked.
if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid.
Changed = true;
it = BB->begin();
e = BB->end();
}
continue;
}
if (isa<DbgInfoIntrinsic>(it))
continue;
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() != 2)
return Changed;
// Try to match and vectorize a horizontal reduction.
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
continue;
}
// Ran into an instruction without users, like a terminator, a store, or a
// function call with an ignored return value. Ignore unused instructions
// (based on instruction type, except for CallInst and InvokeInst).
if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
isa<InvokeInst>(it))) {
KeyNodes.insert(&*it);
bool OpsChanged = false;
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
for (auto *V : it->operand_values()) {
// Try to match and vectorize a horizontal reduction.
OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
}
}
// Start vectorization of the post-process list of instructions from the
// top-tree instructions, to try to vectorize as many instructions as
// possible.
OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid.
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
}
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
isa<InsertValueInst>(it))
PostProcessInstructions.push_back(&*it);
}
return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
<< Entry.second.size() << ".\n");
// Process the GEP list in chunks suitable for the target's supported
// vector size. If a vector register can't hold 1 element, we are done. We
// are trying to vectorize the index computations, so the maximum number of
// elements is based on the size of the index expression, rather than the
// size of the GEP itself (the target's pointer size).
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
if (MaxVecRegSize < EltSize)
continue;
unsigned MaxElts = MaxVecRegSize / EltSize;
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
auto Len = std::min<unsigned>(BE - BI, MaxElts);
auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
// initially collected them. If so, they are marked as deleted, so remove
// them from the set of candidates.
Candidates.remove_if(
[&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
// Remove from the set of candidates all pairs of getelementptrs with
// constant differences. Such getelementptrs are likely not good
// candidates for vectorization in a bottom-up phase since one can be
// computed from the other. We also ensure all candidate getelementptr
// indices are unique.
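// For example, if one getelementptr indexes a[i] and another indexes
// a[i + 2], their SCEV difference is a constant and both are removed from
// the candidate set below.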
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
auto *GEPI = GEPList[I];
if (!Candidates.count(GEPI))
continue;
auto *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
auto *GEPJ = GEPList[J];
auto *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
Candidates.remove(GEPI);
Candidates.remove(GEPJ);
} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
Candidates.remove(GEPJ);
}
}
}
// We break out of the above computation as soon as we know there are
// fewer than two candidates remaining.
if (Candidates.size() < 2)
continue;
// Add the single, non-constant index of each candidate to the bundle. We
// ensured the indices met these constraints when we originally collected
// the getelementptrs.
SmallVector<Value *, 16> Bundle(Candidates.size());
auto BundleIndex = 0u;
for (auto *V : Candidates) {
auto *GEP = cast<GetElementPtrInst>(V);
auto *GEPIdx = GEP->idx_begin()->get();
assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
Bundle[BundleIndex++] = GEPIdx;
}
// Try and vectorize the indices. We are currently only interested in
// gather-like cases of the form:
//
// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
//
// where the loads of "a", the loads of "b", and the subtractions can be
// performed in parallel. It's likely that detecting this pattern in a
// bottom-up phase will be simpler and less costly than building a
// full-blown top-down phase beginning at the consecutive loads.
Changed |= tryToVectorizeList(Bundle, R);
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
++it) {
if (it->second.size() < 2)
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
Changed |= vectorizeStores(it->second, R);
}
return Changed;
}
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index ab57907e088e..b4b0dea0d1af 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -1,1488 +1,1488 @@
/*
* kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP.
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef FTN_STDCALL
#error The support file kmp_ftn_entry.h should not be compiled by itself.
#endif
#ifdef KMP_STUB
#include "kmp_stub.h"
#endif
#include "kmp_i18n.h"
// For affinity format functions
#include "kmp_io.h"
#include "kmp_str.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/* For compatibility with the Gnu/MS Open MP codegen, omp_set_num_threads(),
* omp_set_nested(), and omp_set_dynamic() [in lowercase on MS, and w/o
* a trailing underscore on Linux* OS] take call by value integer arguments.
* + omp_set_max_active_levels()
* + omp_set_schedule()
*
* For backward compatibility with 9.1 and previous Intel compiler, these
* entry points take call by reference integer arguments. */
#ifdef KMP_GOMP_COMPAT
#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_UPPER)
#define PASS_ARGS_BY_VALUE 1
#endif
#endif
#if KMP_OS_WINDOWS
#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_APPEND)
#define PASS_ARGS_BY_VALUE 1
#endif
#endif
// This macro helps to reduce code duplication.
#ifdef PASS_ARGS_BY_VALUE
#define KMP_DEREF
#else
#define KMP_DEREF *
#endif
void FTN_STDCALL FTN_SET_STACKSIZE(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_stacksize(KMP_DEREF arg);
#else
// __kmp_aux_set_stacksize initializes the library if needed
__kmp_aux_set_stacksize((size_t)KMP_DEREF arg);
#endif
}
void FTN_STDCALL FTN_SET_STACKSIZE_S(size_t KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_stacksize(KMP_DEREF arg);
#else
// __kmp_aux_set_stacksize initializes the library if needed
__kmp_aux_set_stacksize(KMP_DEREF arg);
#endif
}
int FTN_STDCALL FTN_GET_STACKSIZE(void) {
#ifdef KMP_STUB
return __kmps_get_stacksize();
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
return (int)__kmp_stksize;
#endif
}
size_t FTN_STDCALL FTN_GET_STACKSIZE_S(void) {
#ifdef KMP_STUB
return __kmps_get_stacksize();
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
return __kmp_stksize;
#endif
}
void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_blocktime(KMP_DEREF arg);
#else
int gtid, tid;
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
__kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid);
#endif
}
int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
#ifdef KMP_STUB
return __kmps_get_blocktime();
#else
int gtid, tid;
kmp_info_t *thread;
kmp_team_p *team;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
team = __kmp_threads[gtid]->th.th_team;
/* These must match the settings used in __kmp_wait_sleep() */
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, KMP_MAX_BLOCKTIME));
return KMP_MAX_BLOCKTIME;
}
#ifdef KMP_ADJUST_BLOCKTIME
else if (__kmp_zero_bt && !get__bt_set(team, tid)) {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, 0));
return 0;
}
#endif /* KMP_ADJUST_BLOCKTIME */
else {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, get__blocktime(team, tid)));
return get__blocktime(team, tid);
}
#endif
}
void FTN_STDCALL FTN_SET_LIBRARY_SERIAL(void) {
#ifdef KMP_STUB
__kmps_set_library(library_serial);
#else
// __kmp_user_set_library initializes the library if needed
__kmp_user_set_library(library_serial);
#endif
}
void FTN_STDCALL FTN_SET_LIBRARY_TURNAROUND(void) {
#ifdef KMP_STUB
__kmps_set_library(library_turnaround);
#else
// __kmp_user_set_library initializes the library if needed
__kmp_user_set_library(library_turnaround);
#endif
}
void FTN_STDCALL FTN_SET_LIBRARY_THROUGHPUT(void) {
#ifdef KMP_STUB
__kmps_set_library(library_throughput);
#else
// __kmp_user_set_library initializes the library if needed
__kmp_user_set_library(library_throughput);
#endif
}
void FTN_STDCALL FTN_SET_LIBRARY(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_library(KMP_DEREF arg);
#else
enum library_type lib;
lib = (enum library_type)KMP_DEREF arg;
// __kmp_user_set_library initializes the library if needed
__kmp_user_set_library(lib);
#endif
}
int FTN_STDCALL FTN_GET_LIBRARY(void) {
#ifdef KMP_STUB
return __kmps_get_library();
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
return ((int)__kmp_library);
#endif
}
void FTN_STDCALL FTN_SET_DISP_NUM_BUFFERS(int KMP_DEREF arg) {
#ifdef KMP_STUB
; // empty routine
#else
// ignore after initialization because some teams have already
// allocated dispatch buffers
if (__kmp_init_serial == 0 && (KMP_DEREF arg) > 0)
__kmp_dispatch_num_buffers = KMP_DEREF arg;
#endif
}
int FTN_STDCALL FTN_SET_AFFINITY(void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_set_affinity(mask);
#endif
}
int FTN_STDCALL FTN_GET_AFFINITY(void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_get_affinity(mask);
#endif
}
int FTN_STDCALL FTN_GET_AFFINITY_MAX_PROC(void) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return 0;
#else
// We really only NEED serial initialization here.
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_get_affinity_max_proc();
#endif
}
void FTN_STDCALL FTN_CREATE_AFFINITY_MASK(void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
*mask = NULL;
#else
// We really only NEED serial initialization here.
kmp_affin_mask_t *mask_internals;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
mask_internals = __kmp_affinity_dispatch->allocate_mask();
KMP_CPU_ZERO(mask_internals);
*mask = mask_internals;
#endif
}
void FTN_STDCALL FTN_DESTROY_AFFINITY_MASK(void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
// Nothing
#else
// We really only NEED serial initialization here.
kmp_affin_mask_t *mask_internals;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (__kmp_env_consistency_check) {
if (*mask == NULL) {
KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask");
}
}
mask_internals = (kmp_affin_mask_t *)(*mask);
__kmp_affinity_dispatch->deallocate_mask(mask_internals);
*mask = NULL;
#endif
}
int FTN_STDCALL FTN_SET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
int FTN_STDCALL FTN_UNSET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
int FTN_STDCALL FTN_GET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
/* ------------------------------------------------------------------------ */
/* sets the requested number of threads for the next parallel region */
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NUM_THREADS)(int KMP_DEREF arg) {
#ifdef KMP_STUB
// Nothing.
#else
__kmp_set_num_threads(KMP_DEREF arg, __kmp_entry_gtid());
#endif
}
/* returns the number of threads in current team */
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_THREADS)(void) {
#ifdef KMP_STUB
return 1;
#else
// __kmpc_bound_num_threads initializes the library if needed
return __kmpc_bound_num_threads(NULL);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) {
#ifdef KMP_STUB
return 1;
#else
int gtid;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
// return thread -> th.th_team -> t.t_current_task[
// thread->th.th_info.ds.ds_tid ] -> icvs.nproc;
return thread->th.th_current_task->td_icvs.nproc;
#endif
}
int FTN_STDCALL FTN_CONTROL_TOOL(int command, int modifier, void *arg) {
#if defined(KMP_STUB) || !OMPT_SUPPORT
return -2;
#else
OMPT_STORE_RETURN_ADDRESS(__kmp_entry_gtid());
if (!TCR_4(__kmp_init_middle)) {
return -2;
}
kmp_info_t *this_thr = __kmp_threads[__kmp_entry_gtid()];
ompt_task_info_t *parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
int ret = __kmp_control_tool(command, modifier, arg);
parent_task_info->frame.enter_frame.ptr = 0;
return ret;
#endif
}
/* OpenMP 5.0 Memory Management support */
omp_allocator_handle_t FTN_STDCALL
FTN_INIT_ALLOCATOR(omp_memspace_handle_t KMP_DEREF m, int KMP_DEREF ntraits,
omp_alloctrait_t tr[]) {
#ifdef KMP_STUB
return NULL;
#else
return __kmpc_init_allocator(__kmp_entry_gtid(), KMP_DEREF m,
KMP_DEREF ntraits, tr);
#endif
}
void FTN_STDCALL FTN_DESTROY_ALLOCATOR(omp_allocator_handle_t al) {
#ifndef KMP_STUB
__kmpc_destroy_allocator(__kmp_entry_gtid(), al);
#endif
}
void FTN_STDCALL FTN_SET_DEFAULT_ALLOCATOR(omp_allocator_handle_t al) {
#ifndef KMP_STUB
__kmpc_set_default_allocator(__kmp_entry_gtid(), al);
#endif
}
omp_allocator_handle_t FTN_STDCALL FTN_GET_DEFAULT_ALLOCATOR(void) {
#ifdef KMP_STUB
return NULL;
#else
return __kmpc_get_default_allocator(__kmp_entry_gtid());
#endif
}
/* OpenMP 5.0 affinity format support */
#ifndef KMP_STUB
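// Copy a C string into a Fortran-style buffer: the result is blank padded
// when the source is shorter than the buffer and is not null terminated.
// For example, copying the 3-character source "abc" into an 8-character
// buffer yields "abc" followed by five blanks.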
static void __kmp_fortran_strncpy_truncate(char *buffer, size_t buf_size,
char const *csrc, size_t csrc_size) {
size_t capped_src_size = csrc_size;
if (csrc_size >= buf_size) {
capped_src_size = buf_size - 1;
}
KMP_STRNCPY_S(buffer, buf_size, csrc, capped_src_size);
if (csrc_size >= buf_size) {
KMP_DEBUG_ASSERT(buffer[buf_size - 1] == '\0');
buffer[buf_size - 1] = csrc[buf_size - 1];
} else {
for (size_t i = csrc_size; i < buf_size; ++i)
buffer[i] = ' ';
}
}
// Convert a Fortran string to a C string by adding a null byte
class ConvertedString {
char *buf;
kmp_info_t *th;
public:
ConvertedString(char const *fortran_str, size_t size) {
th = __kmp_get_thread();
buf = (char *)__kmp_thread_malloc(th, size + 1);
KMP_STRNCPY_S(buf, size + 1, fortran_str, size);
buf[size] = '\0';
}
~ConvertedString() { __kmp_thread_free(th, buf); }
const char *get() const { return buf; }
};
#endif // KMP_STUB
/*
* Set the value of the affinity-format-var ICV on the current device to the
* format specified in the argument.
*/
void FTN_STDCALL FTN_SET_AFFINITY_FORMAT(char const *format, size_t size) {
#ifdef KMP_STUB
return;
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
ConvertedString cformat(format, size);
// Since the __kmp_affinity_format variable is a C string, do not
// use the fortran strncpy function
__kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
cformat.get(), KMP_STRLEN(cformat.get()));
#endif
}
/*
* Returns the number of characters required to hold the entire affinity format
* specification (not including null byte character) and writes the value of the
* affinity-format-var ICV on the current device to buffer. If the return value
* is larger than size, the affinity format specification is truncated.
*/
size_t FTN_STDCALL FTN_GET_AFFINITY_FORMAT(char *buffer, size_t size) {
#ifdef KMP_STUB
return 0;
#else
size_t format_size;
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
format_size = KMP_STRLEN(__kmp_affinity_format);
if (buffer && size) {
__kmp_fortran_strncpy_truncate(buffer, size, __kmp_affinity_format,
format_size);
}
return format_size;
#endif
}
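// Note that passing a null buffer (or a zero size) to the routine above is
// harmless: the copy is skipped and only the length of the current affinity
// format string is reported.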
/*
* Prints the thread affinity information of the current thread in the format
* specified by the format argument. If the format is NULL or a zero-length
* string, the value of the affinity-format-var ICV is used.
*/
void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
#ifdef KMP_STUB
return;
#else
int gtid;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
gtid = __kmp_get_gtid();
ConvertedString cformat(format, size);
__kmp_aux_display_affinity(gtid, cformat.get());
#endif
}
/*
* Returns the number of characters required to hold the entire affinity format
* specification (not including null byte) and prints the thread affinity
* information of the current thread into the character string buffer with the
* size of size in the format specified by the format argument. If the format is
* NULL or a zero-length string, the value of the affinity-format-var ICV is
* used. The buffer must be allocated prior to calling the routine. If the
* return value is larger than size, the affinity format specification is
* truncated.
*/
size_t FTN_STDCALL FTN_CAPTURE_AFFINITY(char *buffer, char const *format,
size_t buf_size, size_t for_size) {
#if defined(KMP_STUB)
return 0;
#else
int gtid;
size_t num_required;
kmp_str_buf_t capture_buf;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
gtid = __kmp_get_gtid();
__kmp_str_buf_init(&capture_buf);
ConvertedString cformat(format, for_size);
num_required = __kmp_aux_capture_affinity(gtid, cformat.get(), &capture_buf);
if (buffer && buf_size) {
__kmp_fortran_strncpy_truncate(buffer, buf_size, capture_buf.str,
capture_buf.used);
}
__kmp_str_buf_free(&capture_buf);
return num_required;
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
#ifdef KMP_STUB
return 0;
#else
int gtid;
#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_HURD || KMP_OS_OPENBSD
gtid = __kmp_entry_gtid();
#elif KMP_OS_WINDOWS
if (!__kmp_init_parallel ||
(gtid = (int)((kmp_intptr_t)TlsGetValue(__kmp_gtid_threadprivate_key))) ==
0) {
// Either library isn't initialized or thread is not registered
// 0 is the correct TID in this case
return 0;
}
--gtid; // We keep (gtid+1) in TLS
#elif KMP_OS_LINUX
#ifdef KMP_TDATA_GTID
if (__kmp_gtid_mode >= 3) {
if ((gtid = __kmp_gtid) == KMP_GTID_DNE) {
return 0;
}
} else {
#endif
if (!__kmp_init_parallel ||
(gtid = (kmp_intptr_t)(
pthread_getspecific(__kmp_gtid_threadprivate_key))) == 0) {
return 0;
}
--gtid;
#ifdef KMP_TDATA_GTID
}
#endif
#else
#error Unknown or unsupported OS
#endif
return __kmp_tid_from_gtid(gtid);
#endif
}
int FTN_STDCALL FTN_GET_NUM_KNOWN_THREADS(void) {
#ifdef KMP_STUB
return 1;
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
/* NOTE: this is not synchronized, so it can change at any moment */
/* NOTE: this number also includes threads preallocated in hot-teams */
return TCR_4(__kmp_nth);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) {
#ifdef KMP_STUB
return 1;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
return __kmp_avail_proc;
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NESTED)(int KMP_DEREF flag) {
KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels");
#ifdef KMP_STUB
__kmps_set_nested(KMP_DEREF flag);
#else
kmp_info_t *thread;
/* For the thread-private internal controls implementation */
thread = __kmp_entry_thread();
__kmp_save_internal_controls(thread);
// Somewhat arbitrarily decide where to get a value for max_active_levels
int max_active_levels = get__max_active_levels(thread);
if (max_active_levels == 1)
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
set__max_active_levels(thread, (KMP_DEREF flag) ? max_active_levels : 1);
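// Thus omp_set_nested(1) keeps the current max_active_levels (or raises it to
// the implementation limit if it was 1), while omp_set_nested(0) drops back to
// a single active level.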
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NESTED)(void) {
KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels");
#ifdef KMP_STUB
return __kmps_get_nested();
#else
kmp_info_t *thread;
thread = __kmp_entry_thread();
return get__max_active_levels(thread) > 1;
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DYNAMIC)(int KMP_DEREF flag) {
#ifdef KMP_STUB
__kmps_set_dynamic(KMP_DEREF flag ? TRUE : FALSE);
#else
kmp_info_t *thread;
/* For the thread-private implementation of the internal controls */
thread = __kmp_entry_thread();
// !!! What if foreign thread calls it?
__kmp_save_internal_controls(thread);
set__dynamic(thread, KMP_DEREF flag ? TRUE : FALSE);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DYNAMIC)(void) {
#ifdef KMP_STUB
return __kmps_get_dynamic();
#else
kmp_info_t *thread;
thread = __kmp_entry_thread();
return get__dynamic(thread);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_PARALLEL)(void) {
#ifdef KMP_STUB
return 0;
#else
kmp_info_t *th = __kmp_entry_thread();
if (th->th.th_teams_microtask) {
// AC: r_in_parallel does not work inside a teams construct where the real
// parallel is inactive, but all threads have the same root, so setting it in
// one team affects other teams.
// The solution is to use the per-team nesting level.
return (th->th.th_team->t.t_active_level ? 1 : 0);
} else
return (th->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_SCHEDULE)(kmp_sched_t KMP_DEREF kind,
int KMP_DEREF modifier) {
#ifdef KMP_STUB
__kmps_set_schedule(KMP_DEREF kind, KMP_DEREF modifier);
#else
/* TO DO: For the per-task implementation of the internal controls */
__kmp_set_schedule(__kmp_entry_gtid(), KMP_DEREF kind, KMP_DEREF modifier);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_SCHEDULE)(kmp_sched_t *kind,
int *modifier) {
#ifdef KMP_STUB
__kmps_get_schedule(kind, modifier);
#else
/* TO DO: For the per-task implementation of the internal controls */
__kmp_get_schedule(__kmp_entry_gtid(), kind, modifier);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_MAX_ACTIVE_LEVELS)(int KMP_DEREF arg) {
#ifdef KMP_STUB
// Nothing.
#else
/* TO DO: We want per-task implementation of this internal control */
__kmp_set_max_active_levels(__kmp_entry_gtid(), KMP_DEREF arg);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_ACTIVE_LEVELS)(void) {
#ifdef KMP_STUB
return 0;
#else
/* TO DO: We want per-task implementation of this internal control */
return __kmp_get_max_active_levels(__kmp_entry_gtid());
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_ACTIVE_LEVEL)(void) {
#ifdef KMP_STUB
return 0; // returns 0 if it is called from the sequential part of the program
#else
/* TO DO: For the per-task implementation of the internal controls */
return __kmp_entry_thread()->th.th_team->t.t_active_level;
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_LEVEL)(void) {
#ifdef KMP_STUB
return 0; // returns 0 if it is called from the sequential part of the program
#else
/* TO DO: For the per-task implementation of the internal controls */
return __kmp_entry_thread()->th.th_team->t.t_level;
#endif
}
int FTN_STDCALL
KMP_EXPAND_NAME(FTN_GET_ANCESTOR_THREAD_NUM)(int KMP_DEREF level) {
#ifdef KMP_STUB
return (KMP_DEREF level) ? (-1) : (0);
#else
return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), KMP_DEREF level);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_SIZE)(int KMP_DEREF level) {
#ifdef KMP_STUB
return (KMP_DEREF level) ? (-1) : (1);
#else
return __kmp_get_team_size(__kmp_entry_gtid(), KMP_DEREF level);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) {
#ifdef KMP_STUB
return 1; // TO DO: clarify whether it returns 1 or 0?
#else
int gtid;
kmp_info_t *thread;
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
return thread->th.th_current_task->td_icvs.thread_limit;
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_FINAL)(void) {
#ifdef KMP_STUB
return 0; // TO DO: clarify whether it returns 1 or 0?
#else
if (!TCR_4(__kmp_init_parallel)) {
return 0;
}
return __kmp_entry_thread()->th.th_current_task->td_flags.final;
#endif
}
kmp_proc_bind_t FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PROC_BIND)(void) {
#ifdef KMP_STUB
return __kmps_get_proc_bind();
#else
return get__proc_bind(__kmp_entry_thread());
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return 0;
#else
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return 0;
return __kmp_affinity_num_masks;
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return 0;
#else
int i;
int retval = 0;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return 0;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
return 0;
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
KMP_CPU_SET_ITERATE(i, mask) {
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
(!KMP_CPU_ISSET(i, mask))) {
continue;
}
++retval;
}
return retval;
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
int *ids) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
// Nothing.
#else
int i, j;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
return;
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
j = 0;
KMP_CPU_SET_ITERATE(i, mask) {
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
(!KMP_CPU_ISSET(i, mask))) {
continue;
}
ids[j++] = i;
}
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return -1;
#else
int gtid;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return -1;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
if (thread->th.th_current_place < 0)
return -1;
return thread->th.th_current_place;
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
return 0;
#else
int gtid, num_places, first_place, last_place;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return 0;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
first_place = thread->th.th_first_place;
last_place = thread->th.th_last_place;
if (first_place < 0 || last_place < 0)
return 0;
if (first_place <= last_place)
num_places = last_place - first_place + 1;
else
num_places = __kmp_affinity_num_masks - first_place + last_place + 1;
return num_places;
#endif
}
void
FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
// Nothing.
#else
int i, gtid, place_num, first_place, last_place, start, end;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
if (!KMP_AFFINITY_CAPABLE())
return;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
first_place = thread->th.th_first_place;
last_place = thread->th.th_last_place;
if (first_place < 0 || last_place < 0)
return;
if (first_place <= last_place) {
start = first_place;
end = last_place;
} else {
start = last_place;
end = first_place;
}
for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) {
place_nums[i] = place_num;
}
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_TEAMS)(void) {
#ifdef KMP_STUB
return 1;
#else
return __kmp_aux_get_num_teams();
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_NUM)(void) {
#ifdef KMP_STUB
return 0;
#else
return __kmp_aux_get_team_num();
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DEFAULT_DEVICE)(void) {
#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
return 0;
#else
return __kmp_entry_thread()->th.th_current_task->td_icvs.default_device;
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) {
#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
// Nothing.
#else
__kmp_entry_thread()->th.th_current_task->td_icvs.default_device =
KMP_DEREF arg;
#endif
}
// Get number of NON-HOST devices.
// libomptarget, if loaded, provides this function in api.cpp.
-int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) {
#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
return 0;
#else
int (*fptr)();
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "_Offload_number_of_devices"))) {
return (*fptr)();
} else if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_devices"))) {
return (*fptr)();
} else { // liboffload & libomptarget don't exist
return 0;
}
#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
}
// This function always returns true when called on the host device.
// The compiler/libomptarget should handle the case where it is called inside
// a target region.
-int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) {
return 1; // This is the host
}
// libomptarget, if loaded, provides this function
-int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) {
#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
return KMP_HOST_DEVICE;
#else
int (*fptr)();
if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_initial_device"))) {
return (*fptr)();
} else { // liboffload & libomptarget don't exist
return KMP_HOST_DEVICE;
}
#endif
}
#if defined(KMP_STUB)
// Entries for the stubs library.
// As all *target* functions are C-only, parameters are always passed by value.
void *FTN_STDCALL FTN_TARGET_ALLOC(size_t size, int device_num) { return 0; }
void FTN_STDCALL FTN_TARGET_FREE(void *device_ptr, int device_num) {}
int FTN_STDCALL FTN_TARGET_IS_PRESENT(void *ptr, int device_num) { return 0; }
int FTN_STDCALL FTN_TARGET_MEMCPY(void *dst, void *src, size_t length,
size_t dst_offset, size_t src_offset,
int dst_device, int src_device) {
return -1;
}
int FTN_STDCALL FTN_TARGET_MEMCPY_RECT(
void *dst, void *src, size_t element_size, int num_dims,
const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets,
const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device,
int src_device) {
return -1;
}
int FTN_STDCALL FTN_TARGET_ASSOCIATE_PTR(void *host_ptr, void *device_ptr,
size_t size, size_t device_offset,
int device_num) {
return -1;
}
int FTN_STDCALL FTN_TARGET_DISASSOCIATE_PTR(void *host_ptr, int device_num) {
return -1;
}
#endif // defined(KMP_STUB)
#ifdef KMP_STUB
typedef enum { UNINIT = -1, UNLOCKED, LOCKED } kmp_stub_lock_t;
#endif /* KMP_STUB */
#if KMP_USE_DYNAMIC_LOCK
void FTN_STDCALL FTN_INIT_LOCK_WITH_HINT(void **user_lock,
uintptr_t KMP_DEREF hint) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_init_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
#endif
}
void FTN_STDCALL FTN_INIT_NEST_LOCK_WITH_HINT(void **user_lock,
uintptr_t KMP_DEREF hint) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_init_nest_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
#endif
}
#endif
/* initialize the lock */
void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_LOCK)(void **user_lock) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_init_lock(NULL, gtid, user_lock);
#endif
}
/* initialize the lock */
void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_NEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_init_nest_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_LOCK)(void **user_lock) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNINIT;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_destroy_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_NEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNINIT;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_destroy_nest_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
if (*((kmp_stub_lock_t *)user_lock) != UNLOCKED) {
// TODO: Issue an error.
}
*((kmp_stub_lock_t *)user_lock) = LOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_set_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
(*((int *)user_lock))++;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_set_nest_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) {
// TODO: Issue an error.
}
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_unset_lock(NULL, gtid, user_lock);
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_NEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) {
// TODO: Issue an error.
}
(*((int *)user_lock))--;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmpc_unset_nest_lock(NULL, gtid, user_lock);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
if (*((kmp_stub_lock_t *)user_lock) == LOCKED) {
return 0;
}
*((kmp_stub_lock_t *)user_lock) = LOCKED;
return 1;
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
return __kmpc_test_lock(NULL, gtid, user_lock);
#endif
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_NEST_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
}
return ++(*((int *)user_lock));
#else
int gtid = __kmp_entry_gtid();
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
return __kmpc_test_nest_lock(NULL, gtid, user_lock);
#endif
}
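// Illustrative sketch (not part of this patch): the nestable-lock semantics
// the stubs above mimic -- omp_test_nest_lock() returns the new nesting count
// on success and 0 on failure, so the owning thread may re-acquire the lock.
//   #include <omp.h>
//   omp_nest_lock_t nl;
//   omp_init_nest_lock(&nl);
//   omp_set_nest_lock(&nl);               // nesting count == 1
//   int count = omp_test_nest_lock(&nl);  // same thread: count == 2
//   omp_unset_nest_lock(&nl);             // back to 1
//   omp_unset_nest_lock(&nl);             // back to 0, lock released
//   omp_destroy_nest_lock(&nl);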
double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTIME)(void) {
#ifdef KMP_STUB
return __kmps_get_wtime();
#else
double data;
#if !KMP_OS_LINUX
// We don't need library initialization to get the time on Linux* OS, so this
// routine can now be used to measure library initialization time on Linux* OS.
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
#endif
__kmp_elapsed(&data);
return data;
#endif
}
double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTICK)(void) {
#ifdef KMP_STUB
return __kmps_get_wtick();
#else
double data;
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
__kmp_elapsed_tick(&data);
return data;
#endif
}
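// Illustrative sketch (not part of this patch): typical user-side use of the
// two timing routines implemented above.
//   #include <omp.h>
//   #include <stdio.h>
//   double t0 = omp_get_wtime();
//   // ... work to be timed ...
//   double t1 = omp_get_wtime();
//   printf("elapsed %g s (timer resolution %g s)\n", t1 - t0, omp_get_wtick());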
/* ------------------------------------------------------------------------ */
void *FTN_STDCALL FTN_MALLOC(size_t KMP_DEREF size) {
// kmpc_malloc initializes the library if needed
return kmpc_malloc(KMP_DEREF size);
}
void *FTN_STDCALL FTN_ALIGNED_MALLOC(size_t KMP_DEREF size,
size_t KMP_DEREF alignment) {
// kmpc_aligned_malloc initializes the library if needed
return kmpc_aligned_malloc(KMP_DEREF size, KMP_DEREF alignment);
}
void *FTN_STDCALL FTN_CALLOC(size_t KMP_DEREF nelem, size_t KMP_DEREF elsize) {
// kmpc_calloc initializes the library if needed
return kmpc_calloc(KMP_DEREF nelem, KMP_DEREF elsize);
}
void *FTN_STDCALL FTN_REALLOC(void *KMP_DEREF ptr, size_t KMP_DEREF size) {
// kmpc_realloc initializes the library if needed
return kmpc_realloc(KMP_DEREF ptr, KMP_DEREF size);
}
void FTN_STDCALL FTN_KFREE(void *KMP_DEREF ptr) {
// does nothing if the library is not initialized
kmpc_free(KMP_DEREF ptr);
}
void FTN_STDCALL FTN_SET_WARNINGS_ON(void) {
#ifndef KMP_STUB
__kmp_generate_warnings = kmp_warnings_explicit;
#endif
}
void FTN_STDCALL FTN_SET_WARNINGS_OFF(void) {
#ifndef KMP_STUB
__kmp_generate_warnings = FALSE;
#endif
}
void FTN_STDCALL FTN_SET_DEFAULTS(char const *str
#ifndef PASS_ARGS_BY_VALUE
,
int len
#endif
) {
#ifndef KMP_STUB
#ifdef PASS_ARGS_BY_VALUE
int len = (int)KMP_STRLEN(str);
#endif
__kmp_aux_set_defaults(str, len);
#endif
}
/* ------------------------------------------------------------------------ */
/* returns the status of cancellation */
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_CANCELLATION)(void) {
#ifdef KMP_STUB
return 0 /* false */;
#else
// initialize the library if needed
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
return __kmp_omp_cancellation;
#endif
}
int FTN_STDCALL FTN_GET_CANCELLATION_STATUS(int cancel_kind) {
#ifdef KMP_STUB
return 0 /* false */;
#else
return __kmp_get_cancellation_status(cancel_kind);
#endif
}
/* returns the maximum allowed task priority */
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
#ifdef KMP_STUB
return 0;
#else
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
return __kmp_max_task_priority;
#endif
}
// This function will be defined in libomptarget. When libomptarget is not
// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
// The compiler/libomptarget will handle this if called inside a target region.
-int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; }
// The compiler will ensure that this is only called from the host in a
// sequential region.
int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) {
#ifdef KMP_STUB
return 1; // just fail
#else
if (device_num == KMP_HOST_DEVICE)
return __kmpc_pause_resource(kind);
else {
#if !KMP_OS_WINDOWS
int (*fptr)(kmp_pause_status_t, int);
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
return (*fptr)(kind, device_num);
else
#endif
return 1; // just fail if there is no libomptarget
}
#endif
}
// The compiler will ensure that this is only called from the host in a
// sequential region.
int FTN_STDCALL FTN_PAUSE_RESOURCE_ALL(kmp_pause_status_t kind) {
#ifdef KMP_STUB
return 1; // just fail
#else
int fails = 0;
#if !KMP_OS_WINDOWS
int (*fptr)(kmp_pause_status_t, int);
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
fails = (*fptr)(kind, KMP_DEVICE_ALL); // pause devices
#endif
fails += __kmpc_pause_resource(kind); // pause host
return fails;
#endif
}
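// Illustrative sketch (not part of this patch): the OpenMP 5.0 user-level API
// the two pause entry points above implement; a nonzero return means failure.
//   #include <omp.h>
//   if (omp_pause_resource_all(omp_pause_soft) != 0) {
//     // resources could not be released; continue without pausing
//   }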
// Returns the maximum number of nesting levels supported by the implementation
int FTN_STDCALL FTN_GET_SUPPORTED_ACTIVE_LEVELS(void) {
#ifdef KMP_STUB
return 1;
#else
return KMP_MAX_ACTIVE_LEVELS_LIMIT;
#endif
}
void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) {
#ifndef KMP_STUB
__kmp_fulfill_event(event);
#endif
}
// display environment variables when requested
void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) {
#ifndef KMP_STUB
__kmp_omp_display_env(verbose);
#endif
}
// GCC compatibility (versioned symbols)
#ifdef KMP_USE_VERSION_SYMBOLS
/* The following sections create versioned symbols for the
omp_* routines. The KMP_VERSION_SYMBOL macro expands the API name and
then maps it to a versioned symbol.
libgomp ``versions'' its symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also
retaining the default version which libomp uses: VERSION (defined in
exports_so.txt). If you want to see the versioned symbols for libgomp.so.1
then just type:
objdump -T /path/to/libgomp.so.1 | grep omp_
Example:
Step 1) Create __kmp_api_omp_set_num_threads_10_alias which is alias of
__kmp_api_omp_set_num_threads
Step 2) Set __kmp_api_omp_set_num_threads_10_alias to version:
omp_set_num_threads@OMP_1.0
Step 2B) Set __kmp_api_omp_set_num_threads to default version:
omp_set_num_threads@@VERSION
*/
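// Illustrative sketch (not part of this patch): assuming FTN_SET_NUM_THREADS
// maps to the plain lower-case name omp_set_num_threads, the first
// KMP_VERSION_SYMBOL below expands roughly to:
//   __typeof__(__kmp_api_omp_set_num_threads)
//       __kmp_api_omp_set_num_threads_10_alias
//       __attribute__((alias("__kmp_api_omp_set_num_threads")));
//   __asm__(".symver __kmp_api_omp_set_num_threads_10_alias,"
//           "omp_set_num_threads@OMP_1.0\n\t");
//   __asm__(".symver __kmp_api_omp_set_num_threads,"
//           "omp_set_num_threads@@VERSION\n\t");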
// OMP_1.0 versioned symbols
KMP_VERSION_SYMBOL(FTN_SET_NUM_THREADS, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_NUM_THREADS, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_MAX_THREADS, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_THREAD_NUM, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_NUM_PROCS, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_IN_PARALLEL, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_SET_DYNAMIC, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_DYNAMIC, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_SET_NESTED, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_GET_NESTED, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_SET_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 10, "OMP_1.0");
KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 10, "OMP_1.0");
// OMP_2.0 versioned symbols
KMP_VERSION_SYMBOL(FTN_GET_WTICK, 20, "OMP_2.0");
KMP_VERSION_SYMBOL(FTN_GET_WTIME, 20, "OMP_2.0");
// OMP_3.0 versioned symbols
KMP_VERSION_SYMBOL(FTN_SET_SCHEDULE, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_SCHEDULE, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_THREAD_LIMIT, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_SET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_ANCESTOR_THREAD_NUM, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_LEVEL, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_TEAM_SIZE, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_GET_ACTIVE_LEVEL, 30, "OMP_3.0");
// the lock routines have a 1.0 and 3.0 version
KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_SET_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 30, "OMP_3.0");
KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 30, "OMP_3.0");
// OMP_3.1 versioned symbol
KMP_VERSION_SYMBOL(FTN_IN_FINAL, 31, "OMP_3.1");
// OMP_4.0 versioned symbols
KMP_VERSION_SYMBOL(FTN_GET_PROC_BIND, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_GET_NUM_TEAMS, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_GET_TEAM_NUM, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_GET_CANCELLATION, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_GET_DEFAULT_DEVICE, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_SET_DEFAULT_DEVICE, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_IS_INITIAL_DEVICE, 40, "OMP_4.0");
KMP_VERSION_SYMBOL(FTN_GET_NUM_DEVICES, 40, "OMP_4.0");
// OMP_4.5 versioned symbols
KMP_VERSION_SYMBOL(FTN_GET_MAX_TASK_PRIORITY, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_NUM_PLACES, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM_PROCS, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PLACE_PROC_IDS, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PARTITION_NUM_PLACES, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PARTITION_PLACE_NUMS, 45, "OMP_4.5");
// KMP_VERSION_SYMBOL(FTN_GET_INITIAL_DEVICE, 45, "OMP_4.5");
// OMP_5.0 versioned symbols
// KMP_VERSION_SYMBOL(FTN_GET_DEVICE_NUM, 50, "OMP_5.0");
// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE, 50, "OMP_5.0");
// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE_ALL, 50, "OMP_5.0");
// KMP_VERSION_SYMBOL(FTN_GET_SUPPORTED_ACTIVE_LEVELS, 50, "OMP_5.0");
// KMP_VERSION_SYMBOL(FTN_FULFILL_EVENT, 50, "OMP_5.0");
#endif // KMP_USE_VERSION_SYMBOLS
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
// end of file //
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index bfe7765b2a96..d1511904e94b 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -1,1041 +1,1047 @@
/*
* kmp_os.h -- KPTS runtime header file.
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef KMP_OS_H
#define KMP_OS_H
#include "kmp_config.h"
#include <stdlib.h>
#include <atomic>
#define KMP_FTN_PLAIN 1
#define KMP_FTN_APPEND 2
#define KMP_FTN_UPPER 3
/*
#define KMP_FTN_PREPEND 4
#define KMP_FTN_UAPPEND 5
*/
#define KMP_PTR_SKIP (sizeof(void *))
/* -------------------------- Compiler variations ------------------------ */
#define KMP_OFF 0
#define KMP_ON 1
#define KMP_MEM_CONS_VOLATILE 0
#define KMP_MEM_CONS_FENCE 1
#ifndef KMP_MEM_CONS_MODEL
#define KMP_MEM_CONS_MODEL KMP_MEM_CONS_VOLATILE
#endif
#ifndef __has_cpp_attribute
#define __has_cpp_attribute(x) 0
#endif
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
/* ------------------------- Compiler recognition ---------------------- */
#define KMP_COMPILER_ICC 0
#define KMP_COMPILER_GCC 0
#define KMP_COMPILER_CLANG 0
#define KMP_COMPILER_MSVC 0
#if defined(__INTEL_COMPILER)
#undef KMP_COMPILER_ICC
#define KMP_COMPILER_ICC 1
#elif defined(__clang__)
#undef KMP_COMPILER_CLANG
#define KMP_COMPILER_CLANG 1
#elif defined(__GNUC__)
#undef KMP_COMPILER_GCC
#define KMP_COMPILER_GCC 1
#elif defined(_MSC_VER)
#undef KMP_COMPILER_MSVC
#define KMP_COMPILER_MSVC 1
#else
#error Unknown compiler
#endif
#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK
#define KMP_AFFINITY_SUPPORTED 1
#if KMP_OS_WINDOWS && KMP_ARCH_X86_64
#define KMP_GROUP_AFFINITY 1
#else
#define KMP_GROUP_AFFINITY 0
#endif
#else
#define KMP_AFFINITY_SUPPORTED 0
#define KMP_GROUP_AFFINITY 0
#endif
/* Check for quad-precision extension. */
#define KMP_HAVE_QUAD 0
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#if KMP_COMPILER_ICC
/* _Quad is already defined for icc */
#undef KMP_HAVE_QUAD
#define KMP_HAVE_QUAD 1
#elif KMP_COMPILER_CLANG
/* Clang doesn't support a software-implemented
128-bit extended precision type yet */
typedef long double _Quad;
#elif KMP_COMPILER_GCC
/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */
#if !KMP_OS_NETBSD
typedef __float128 _Quad;
#undef KMP_HAVE_QUAD
#define KMP_HAVE_QUAD 1
#endif
#elif KMP_COMPILER_MSVC
typedef long double _Quad;
#endif
#else
#if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC
typedef long double _Quad;
#undef KMP_HAVE_QUAD
#define KMP_HAVE_QUAD 1
#endif
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#define KMP_USE_X87CONTROL 0
#if KMP_OS_WINDOWS
#define KMP_END_OF_LINE "\r\n"
typedef char kmp_int8;
typedef unsigned char kmp_uint8;
typedef short kmp_int16;
typedef unsigned short kmp_uint16;
typedef int kmp_int32;
typedef unsigned int kmp_uint32;
#define KMP_INT32_SPEC "d"
#define KMP_UINT32_SPEC "u"
#ifndef KMP_STRUCT64
typedef __int64 kmp_int64;
typedef unsigned __int64 kmp_uint64;
#define KMP_INT64_SPEC "I64d"
#define KMP_UINT64_SPEC "I64u"
#else
struct kmp_struct64 {
kmp_int32 a, b;
};
typedef struct kmp_struct64 kmp_int64;
typedef struct kmp_struct64 kmp_uint64;
/* Not sure what to use for KMP_[U]INT64_SPEC here */
#endif
#if KMP_ARCH_X86 && KMP_MSVC_COMPAT
#undef KMP_USE_X87CONTROL
#define KMP_USE_X87CONTROL 1
#endif
#if KMP_ARCH_X86_64
#define KMP_INTPTR 1
typedef __int64 kmp_intptr_t;
typedef unsigned __int64 kmp_uintptr_t;
#define KMP_INTPTR_SPEC "I64d"
#define KMP_UINTPTR_SPEC "I64u"
#endif
#endif /* KMP_OS_WINDOWS */
#if KMP_OS_UNIX
#define KMP_END_OF_LINE "\n"
typedef char kmp_int8;
typedef unsigned char kmp_uint8;
typedef short kmp_int16;
typedef unsigned short kmp_uint16;
typedef int kmp_int32;
typedef unsigned int kmp_uint32;
typedef long long kmp_int64;
typedef unsigned long long kmp_uint64;
#define KMP_INT32_SPEC "d"
#define KMP_UINT32_SPEC "u"
#define KMP_INT64_SPEC "lld"
#define KMP_UINT64_SPEC "llu"
#endif /* KMP_OS_UNIX */
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
#else
#error "Can't determine size_t printf format specifier."
#endif
#if KMP_ARCH_X86
#define KMP_SIZE_T_MAX (0xFFFFFFFF)
#else
#define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF)
#endif
typedef size_t kmp_size_t;
typedef float kmp_real32;
typedef double kmp_real64;
#ifndef KMP_INTPTR
#define KMP_INTPTR 1
typedef long kmp_intptr_t;
typedef unsigned long kmp_uintptr_t;
#define KMP_INTPTR_SPEC "ld"
#define KMP_UINTPTR_SPEC "lu"
#endif
#ifdef BUILD_I8
typedef kmp_int64 kmp_int;
typedef kmp_uint64 kmp_uint;
#else
typedef kmp_int32 kmp_int;
typedef kmp_uint32 kmp_uint;
#endif /* BUILD_I8 */
#define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF)
#define KMP_INT_MIN ((kmp_int32)0x80000000)
#ifdef __cplusplus
// macros to cast out qualifiers and to re-interpret types
#define CCAST(type, var) const_cast<type>(var)
#define RCAST(type, var) reinterpret_cast<type>(var)
//-------------------------------------------------------------------------
// template for debug prints specification ( d, u, lld, llu ), and to obtain
// signed/unsigned flavors of a type
template <typename T> struct traits_t {};
// int
template <> struct traits_t<signed int> {
typedef signed int signed_t;
typedef unsigned int unsigned_t;
typedef double floating_t;
static char const *spec;
static const signed_t max_value = 0x7fffffff;
static const signed_t min_value = 0x80000000;
static const int type_size = sizeof(signed_t);
};
// unsigned int
template <> struct traits_t<unsigned int> {
typedef signed int signed_t;
typedef unsigned int unsigned_t;
typedef double floating_t;
static char const *spec;
static const unsigned_t max_value = 0xffffffff;
static const unsigned_t min_value = 0x00000000;
static const int type_size = sizeof(unsigned_t);
};
// long
template <> struct traits_t<signed long> {
typedef signed long signed_t;
typedef unsigned long unsigned_t;
typedef long double floating_t;
static char const *spec;
static const int type_size = sizeof(signed_t);
};
// long long
template <> struct traits_t<signed long long> {
typedef signed long long signed_t;
typedef unsigned long long unsigned_t;
typedef long double floating_t;
static char const *spec;
static const signed_t max_value = 0x7fffffffffffffffLL;
static const signed_t min_value = 0x8000000000000000LL;
static const int type_size = sizeof(signed_t);
};
// unsigned long long
template <> struct traits_t<unsigned long long> {
typedef signed long long signed_t;
typedef unsigned long long unsigned_t;
typedef long double floating_t;
static char const *spec;
static const unsigned_t max_value = 0xffffffffffffffffLL;
static const unsigned_t min_value = 0x0000000000000000LL;
static const int type_size = sizeof(unsigned_t);
};
//-------------------------------------------------------------------------
#else
#define CCAST(type, var) (type)(var)
#define RCAST(type, var) (type)(var)
#endif // __cplusplus
#define KMP_EXPORT extern /* export declaration in guide libraries */
#if __GNUC__ >= 4 && !defined(__MINGW32__)
#define __forceinline __inline
#endif
#if KMP_OS_WINDOWS
#include <windows.h>
static inline int KMP_GET_PAGE_SIZE(void) {
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
}
#else
#define KMP_GET_PAGE_SIZE() getpagesize()
#endif
#define PAGE_ALIGNED(_addr) \
(!((size_t)_addr & (size_t)(KMP_GET_PAGE_SIZE() - 1)))
#define ALIGN_TO_PAGE(x) \
(void *)(((size_t)(x)) & ~((size_t)(KMP_GET_PAGE_SIZE() - 1)))
/* ---------- Support for cache alignment, padding, etc. ----------------*/
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */
/* Define the default size of the cache line */
#ifndef CACHE_LINE
#define CACHE_LINE 128 /* cache line size in bytes */
#else
#if (CACHE_LINE < 64) && !defined(KMP_OS_DARWIN)
// 2006-02-13: This produces too many warnings on OS X*. Disable for now
#warning CACHE_LINE is too small.
#endif
#endif /* CACHE_LINE */
#define KMP_CACHE_PREFETCH(ADDR) /* nothing */
// Define attribute that indicates that the fall through from the previous
// case label is intentional and should not be diagnosed by a compiler
// Code from libcxx/include/__config
// Use a function-like macro to imply that it must be followed by a semicolon
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
# define KMP_FALLTHROUGH() [[fallthrough]]
#elif __has_cpp_attribute(clang::fallthrough)
# define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
# define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
#else
# define KMP_FALLTHROUGH() ((void)0)
#endif
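// Illustrative sketch (not part of this patch): typical use of the macro;
// handle_pair()/handle_single() are hypothetical.
//   switch (n) {
//   case 2:
//     handle_pair();
//     KMP_FALLTHROUGH(); // deliberate fall through, no compiler warning
//   case 1:
//     handle_single();
//     break;
//   }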
// Define attribute that indicates a function does not return
#if __cplusplus >= 201103L
#define KMP_NORETURN [[noreturn]]
#elif KMP_OS_WINDOWS
#define KMP_NORETURN __declspec(noreturn)
#else
#define KMP_NORETURN __attribute__((noreturn))
#endif
#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
#define KMP_ALIGN(bytes) __declspec(align(bytes))
#define KMP_THREAD_LOCAL __declspec(thread)
#define KMP_ALIAS /* Nothing */
#else
#define KMP_ALIGN(bytes) __attribute__((aligned(bytes)))
#define KMP_THREAD_LOCAL __thread
#define KMP_ALIAS(alias_of) __attribute__((alias(alias_of)))
#endif
+#if KMP_HAVE_WEAK_ATTRIBUTE && !KMP_DYNAMIC_LIB
+#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak))
+#else
+#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */
+#endif
+
#if KMP_HAVE_WEAK_ATTRIBUTE
-#define KMP_WEAK_ATTRIBUTE __attribute__((weak))
+#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak))
#else
-#define KMP_WEAK_ATTRIBUTE /* Nothing */
+#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */
#endif
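// Illustrative sketch (not part of this patch): how a weak default interacts
// with a strong definition; my_query() is hypothetical. At static link time a
// strong definition elsewhere overrides the weak fallback, while shared-object
// lookup follows its own interposition rules -- presumably why the _EXTERNAL
// variant stays empty for KMP_DYNAMIC_LIB and the dlsym() probing in the entry
// points is used there instead.
//   // weak fallback, kept only when no other object defines the symbol
//   __attribute__((weak)) int my_query(void) { return 0; }
//   // a strong definition in another object file wins at link time
//   int my_query(void) { return 42; }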
// Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME
#ifndef KMP_STR
#define KMP_STR(x) _KMP_STR(x)
#define _KMP_STR(x) #x
#endif
#ifdef KMP_USE_VERSION_SYMBOLS
// If using versioned symbols, KMP_EXPAND_NAME prepends
// __kmp_api_ to the real API name
#define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name)
#define _KMP_EXPAND_NAME(api_name) __kmp_api_##api_name
#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) \
_KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, "VERSION")
#define _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, default_ver) \
__typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver_num##_alias \
__attribute__((alias(KMP_STR(__kmp_api_##api_name)))); \
__asm__( \
".symver " KMP_STR(__kmp_api_##api_name##_##ver_num##_alias) "," KMP_STR( \
api_name) "@" ver_str "\n\t"); \
__asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR( \
api_name) "@@" default_ver "\n\t")
#else // KMP_USE_VERSION_SYMBOLS
#define KMP_EXPAND_NAME(api_name) api_name
#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */
#endif // KMP_USE_VERSION_SYMBOLS
/* Temporary note: if performance testing of this passes, we can remove
all references to KMP_DO_ALIGN and replace with KMP_ALIGN. */
#define KMP_DO_ALIGN(bytes) KMP_ALIGN(bytes)
#define KMP_ALIGN_CACHE KMP_ALIGN(CACHE_LINE)
#define KMP_ALIGN_CACHE_INTERNODE KMP_ALIGN(INTERNODE_CACHE_LINE)
/* General purpose fence types for memory operations */
enum kmp_mem_fence_type {
kmp_no_fence, /* No memory fence */
kmp_acquire_fence, /* Acquire (read) memory fence */
kmp_release_fence, /* Release (write) memory fence */
kmp_full_fence /* Full (read+write) memory fence */
};
// Synchronization primitives
#if KMP_ASM_INTRINS && KMP_OS_WINDOWS
#if KMP_MSVC_COMPAT && !KMP_COMPILER_CLANG
#pragma intrinsic(InterlockedExchangeAdd)
#pragma intrinsic(InterlockedCompareExchange)
#pragma intrinsic(InterlockedExchange)
#pragma intrinsic(InterlockedExchange64)
#endif
// Using InterlockedIncrement / InterlockedDecrement causes a library loading
// ordering problem, so we use InterlockedExchangeAdd instead.
#define KMP_TEST_THEN_INC32(p) InterlockedExchangeAdd((volatile long *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), 1)
#define KMP_TEST_THEN_ADD4_32(p) InterlockedExchangeAdd((volatile long *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), 4)
#define KMP_TEST_THEN_DEC32(p) InterlockedExchangeAdd((volatile long *)(p), -1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), -1)
#define KMP_TEST_THEN_ADD32(p, v) \
InterlockedExchangeAdd((volatile long *)(p), (v))
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
InterlockedCompareExchange((volatile long *)(p), (long)(sv), (long)(cv))
#define KMP_XCHG_FIXED32(p, v) \
InterlockedExchange((volatile long *)(p), (long)(v))
#define KMP_XCHG_FIXED64(p, v) \
InterlockedExchange64((volatile kmp_int64 *)(p), (kmp_int64)(v))
inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
kmp_int32 tmp = InterlockedExchange((volatile long *)p, *(long *)&v);
return *(kmp_real32 *)&tmp;
}
// Routines that we still need to implement in assembly.
extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
kmp_int32 sv);
extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
kmp_int64 sv);
extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
kmp_int16 cv, kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
kmp_int32 cv, kmp_int32 sv);
extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
kmp_int64 cv, kmp_int64 sv);
extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
//#define KMP_TEST_THEN_INC32(p) __kmp_test_then_add32((p), 1)
//#define KMP_TEST_THEN_INC_ACQ32(p) __kmp_test_then_add32((p), 1)
#define KMP_TEST_THEN_INC64(p) __kmp_test_then_add64((p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) __kmp_test_then_add64((p), 1LL)
//#define KMP_TEST_THEN_ADD4_32(p) __kmp_test_then_add32((p), 4)
//#define KMP_TEST_THEN_ADD4_ACQ32(p) __kmp_test_then_add32((p), 4)
#define KMP_TEST_THEN_ADD4_64(p) __kmp_test_then_add64((p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) __kmp_test_then_add64((p), 4LL)
//#define KMP_TEST_THEN_DEC32(p) __kmp_test_then_add32((p), -1)
//#define KMP_TEST_THEN_DEC_ACQ32(p) __kmp_test_then_add32((p), -1)
#define KMP_TEST_THEN_DEC64(p) __kmp_test_then_add64((p), -1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) __kmp_test_then_add64((p), -1LL)
//#define KMP_TEST_THEN_ADD32(p, v) __kmp_test_then_add32((p), (v))
#define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v))
#define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v))
#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__kmp_compare_and_store8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__kmp_compare_and_store8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__kmp_compare_and_store16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__kmp_compare_and_store16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#if KMP_ARCH_X86
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#else /* 64 bit pointers */
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#endif /* KMP_ARCH_X86 */
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__kmp_compare_and_store_ret8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__kmp_compare_and_store_ret16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#define KMP_XCHG_FIXED8(p, v) \
__kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v));
#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v));
//#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v));
//#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v));
//#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v));
#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
/* cast p to the correct type so that the proper intrinsic will be used */
#define KMP_TEST_THEN_INC32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
#if KMP_ARCH_MIPS
#define KMP_TEST_THEN_INC64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#else
#define KMP_TEST_THEN_INC64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
#endif
#define KMP_TEST_THEN_ADD4_32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
#if KMP_ARCH_MIPS
#define KMP_TEST_THEN_ADD4_64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_DEC64(p) \
__atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#else
#define KMP_TEST_THEN_ADD4_64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_DEC64(p) \
__sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
#endif
#define KMP_TEST_THEN_DEC32(p) \
__sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
__sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_ADD8(p, v) \
__sync_fetch_and_add((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_ADD32(p, v) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), (kmp_int32)(v))
#if KMP_ARCH_MIPS
#define KMP_TEST_THEN_ADD64(p, v) \
__atomic_fetch_add((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
__ATOMIC_SEQ_CST)
#else
#define KMP_TEST_THEN_ADD64(p, v) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), (kmp_int64)(v))
#endif
#define KMP_TEST_THEN_OR8(p, v) \
__sync_fetch_and_or((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_AND8(p, v) \
__sync_fetch_and_and((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_OR32(p, v) \
__sync_fetch_and_or((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_AND32(p, v) \
__sync_fetch_and_and((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#if KMP_ARCH_MIPS
#define KMP_TEST_THEN_OR64(p, v) \
__atomic_fetch_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
__ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_AND64(p, v) \
__atomic_fetch_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
__ATOMIC_SEQ_CST)
#else
#define KMP_TEST_THEN_OR64(p, v) \
__sync_fetch_and_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#define KMP_TEST_THEN_AND64(p, v) \
__sync_fetch_and_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#endif
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
(kmp_uint8)(sv))
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
(kmp_uint8)(sv))
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
(kmp_uint16)(sv))
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
(kmp_uint16)(sv))
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
(kmp_uint32)(sv))
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
(kmp_uint32)(sv))
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv), \
(void *)(sv))
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
(kmp_uint8)(sv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
(kmp_uint16)(sv))
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
(kmp_uint32)(sv))
#if KMP_ARCH_MIPS
static inline bool mips_sync_bool_compare_and_swap(
volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
}
static inline bool mips_sync_val_compare_and_swap(
volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
__atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
return cv;
}
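// Illustrative note (not part of this patch): __atomic_compare_exchange writes
// the value observed in *p back into cv when the exchange fails, so returning
// cv reproduces __sync_val_compare_and_swap's "return the prior value"
// contract, while the bool variant only reports whether the swap happened.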
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
(kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
(kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
(kmp_uint64)(sv))
#else
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
(kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
(kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
(kmp_uint64)(sv))
#endif
#define KMP_XCHG_FIXED8(p, v) \
__sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v))
#define KMP_XCHG_FIXED16(p, v) \
__sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v))
#define KMP_XCHG_FIXED32(p, v) \
__sync_lock_test_and_set((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_XCHG_FIXED64(p, v) \
__sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
kmp_int32 tmp =
__sync_lock_test_and_set((volatile kmp_uint32 *)(p), *(kmp_uint32 *)&v);
return *(kmp_real32 *)&tmp;
}
inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) {
kmp_int64 tmp =
__sync_lock_test_and_set((volatile kmp_uint64 *)(p), *(kmp_uint64 *)&v);
return *(kmp_real64 *)&tmp;
}
#else
extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
kmp_int32 sv);
extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
kmp_int64 sv);
extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
kmp_int16 cv, kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
kmp_int32 cv, kmp_int32 sv);
extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
kmp_int64 cv, kmp_int64 sv);
extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#define KMP_TEST_THEN_INC32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_ADD4_32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_DEC32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
#define KMP_TEST_THEN_DEC64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
#define KMP_TEST_THEN_ADD8(p, v) \
__kmp_test_then_add8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_ADD32(p, v) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), (kmp_int32)(v))
#define KMP_TEST_THEN_ADD64(p, v) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), (kmp_int64)(v))
#define KMP_TEST_THEN_OR8(p, v) \
__kmp_test_then_or8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_AND8(p, v) \
__kmp_test_then_and8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_OR32(p, v) \
__kmp_test_then_or32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_AND32(p, v) \
__kmp_test_then_and32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_OR64(p, v) \
__kmp_test_then_or64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#define KMP_TEST_THEN_AND64(p, v) \
__kmp_test_then_and64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \
(kmp_int8)(sv))
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \
(kmp_int8)(sv))
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \
(kmp_int16)(sv))
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \
(kmp_int16)(sv))
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#if KMP_ARCH_X86
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#else /* 64 bit pointers */
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#endif /* KMP_ARCH_X86 */
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__kmp_compare_and_store_ret8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__kmp_compare_and_store_ret16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
__kmp_compare_and_store_ret32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
(kmp_int32)(sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
(kmp_int64)(sv))
#define KMP_XCHG_FIXED8(p, v) \
__kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v));
#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v));
#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v));
#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v));
#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v));
#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
#endif /* KMP_ASM_INTRINS */
/* ------------- relaxed consistency memory model stuff ------------------ */
#if KMP_OS_WINDOWS
#ifdef __ABSOFT_WIN
#define KMP_MB() asm("nop")
#define KMP_IMB() asm("nop")
#else
#define KMP_MB() /* _asm{ nop } */
#define KMP_IMB() /* _asm{ nop } */
#endif
#endif /* KMP_OS_WINDOWS */
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
#define KMP_MB() __sync_synchronize()
#endif
#ifndef KMP_MB
#define KMP_MB() /* nothing to do */
#endif
#ifndef KMP_IMB
#define KMP_IMB() /* nothing to do */
#endif
#ifndef KMP_ST_REL32
#define KMP_ST_REL32(A, D) (*(A) = (D))
#endif
#ifndef KMP_ST_REL64
#define KMP_ST_REL64(A, D) (*(A) = (D))
#endif
#ifndef KMP_LD_ACQ32
#define KMP_LD_ACQ32(A) (*(A))
#endif
#ifndef KMP_LD_ACQ64
#define KMP_LD_ACQ64(A) (*(A))
#endif
/* ------------------------------------------------------------------------ */
// FIXME - maybe this should be
//
// #define TCR_4(a) (*(volatile kmp_int32 *)(&a))
// #define TCW_4(a,b) (a) = (*(volatile kmp_int32 *)&(b))
//
// #define TCR_8(a) (*(volatile kmp_int64 *)(a))
// #define TCW_8(a,b) (a) = (*(volatile kmp_int64 *)(&b))
//
// I'm fairly certain this is the correct thing to do, but I'm afraid
// of performance regressions.
#define TCR_1(a) (a)
#define TCW_1(a, b) (a) = (b)
#define TCR_4(a) (a)
#define TCW_4(a, b) (a) = (b)
#define TCI_4(a) (++(a))
#define TCD_4(a) (--(a))
#define TCR_8(a) (a)
#define TCW_8(a, b) (a) = (b)
#define TCI_8(a) (++(a))
#define TCD_8(a) (--(a))
#define TCR_SYNC_4(a) (a)
#define TCW_SYNC_4(a, b) (a) = (b)
#define TCX_SYNC_4(a, b, c) \
KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a), \
(kmp_int32)(b), (kmp_int32)(c))
#define TCR_SYNC_8(a) (a)
#define TCW_SYNC_8(a, b) (a) = (b)
#define TCX_SYNC_8(a, b, c) \
KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), \
(kmp_int64)(b), (kmp_int64)(c))
#if KMP_ARCH_X86 || KMP_ARCH_MIPS
// What about ARM?
#define TCR_PTR(a) ((void *)TCR_4(a))
#define TCW_PTR(a, b) TCW_4((a), (b))
#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_4(a))
#define TCW_SYNC_PTR(a, b) TCW_SYNC_4((a), (b))
#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_4((a), (b), (c)))
#else /* 64 bit pointers */
#define TCR_PTR(a) ((void *)TCR_8(a))
#define TCW_PTR(a, b) TCW_8((a), (b))
#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_8(a))
#define TCW_SYNC_PTR(a, b) TCW_SYNC_8((a), (b))
#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_8((a), (b), (c)))
#endif /* KMP_ARCH_X86 */
/* If these FTN_{TRUE,FALSE} values change, we may need to change several places
   where they are used to check that the language is Fortran, not C. */
#ifndef FTN_TRUE
#define FTN_TRUE TRUE
#endif
#ifndef FTN_FALSE
#define FTN_FALSE FALSE
#endif
typedef void (*microtask_t)(int *gtid, int *npr, ...);
#ifdef USE_VOLATILE_CAST
#define VOLATILE_CAST(x) (volatile x)
#else
#define VOLATILE_CAST(x) (x)
#endif
#define KMP_WAIT __kmp_wait_4
#define KMP_WAIT_PTR __kmp_wait_4_ptr
#define KMP_EQ __kmp_eq_4
#define KMP_NEQ __kmp_neq_4
#define KMP_LT __kmp_lt_4
#define KMP_GE __kmp_ge_4
#define KMP_LE __kmp_le_4
/* Workaround for Intel(R) 64 code gen bug when taking address of static array
* (Intel(R) 64 Tracker #138) */
#if (KMP_ARCH_X86_64 || KMP_ARCH_PPC64) && KMP_OS_LINUX
#define STATIC_EFI2_WORKAROUND
#else
#define STATIC_EFI2_WORKAROUND static
#endif
// Support of BGET usage
#ifndef KMP_USE_BGET
#define KMP_USE_BGET 1
#endif
// Switches for OSS builds
#ifndef USE_CMPXCHG_FIX
#define USE_CMPXCHG_FIX 1
#endif
// Enable dynamic user lock
#define KMP_USE_DYNAMIC_LOCK 1
// Enable Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) if
// dynamic user lock is turned on
#if KMP_USE_DYNAMIC_LOCK
// Visual Studio can't handle the asm sections in this code
#define KMP_USE_TSX ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_COMPILER_MSVC)
#ifdef KMP_USE_ADAPTIVE_LOCKS
#undef KMP_USE_ADAPTIVE_LOCKS
#endif
#define KMP_USE_ADAPTIVE_LOCKS KMP_USE_TSX
#endif
// Enable conversion of ticks to seconds (tick time)
#if KMP_STATS_ENABLED
#define KMP_HAVE_TICK_TIME \
(KMP_OS_LINUX && (KMP_MIC || KMP_ARCH_X86 || KMP_ARCH_X86_64))
#endif
// Warning levels
enum kmp_warnings_level {
kmp_warnings_off = 0, /* No warnings */
kmp_warnings_low, /* Minimal warnings (default) */
kmp_warnings_explicit = 6, /* Explicitly set to ON - more warnings */
kmp_warnings_verbose /* reserved */
};
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
// Macros for C++11 atomic functions
#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order)
#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order)
// For non-default load/store
#define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire)
#define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed)
#define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release)
#define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed)
// For non-default fetch_<op>
#define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel)
#define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel)
#define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel)
#define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel)
#define KMP_ATOMIC_INC(p) KMP_ATOMIC_OP(fetch_add, p, 1, acq_rel)
#define KMP_ATOMIC_DEC(p) KMP_ATOMIC_OP(fetch_sub, p, 1, acq_rel)
#define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed)
#define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_OP(fetch_add, p, 1, relaxed)
// Callers of the following functions cannot see the side effect on "expected".
template <typename T>
bool __kmp_atomic_compare_store(std::atomic<T> *p, T expected, T desired) {
return p->compare_exchange_strong(
expected, desired, std::memory_order_acq_rel, std::memory_order_relaxed);
}
template <typename T>
bool __kmp_atomic_compare_store_acq(std::atomic<T> *p, T expected, T desired) {
return p->compare_exchange_strong(
expected, desired, std::memory_order_acquire, std::memory_order_relaxed);
}
template <typename T>
bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
return p->compare_exchange_strong(
expected, desired, std::memory_order_release, std::memory_order_relaxed);
}
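// Illustrative sketch (not part of this patch): what taking "expected" by
// value hides from callers -- compare_exchange_strong overwrites its expected
// argument with the observed value on failure.
//   std::atomic<int> flag{0};
//   int expected = 5;
//   bool ok = flag.compare_exchange_strong(expected, 1); // fails, expected -> 0
//   bool ok2 = __kmp_atomic_compare_store(&flag, 0, 1);  // caller's 0 untouched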
#endif /* KMP_OS_H */
// Safe C API
#include "kmp_safe_c_api.h"
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index a7288f08a661..9be699110fc6 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -1,504 +1,504 @@
/*
* ompt-specific.cpp -- OMPT internal functions
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//******************************************************************************
// include files
//******************************************************************************
#include "kmp.h"
#include "ompt-specific.h"
#if KMP_OS_UNIX
#include <dlfcn.h>
#endif
#if KMP_OS_WINDOWS
#define THREAD_LOCAL __declspec(thread)
#else
#define THREAD_LOCAL __thread
#endif
-#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE
+#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE_INTERNAL
//******************************************************************************
// macros
//******************************************************************************
#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info
#define OMPT_THREAD_ID_BITS 16
//******************************************************************************
// private operations
//******************************************************************************
//----------------------------------------------------------
// traverse the team and task hierarchy
// note: __ompt_get_teaminfo and __ompt_get_task_info_object
// traverse the hierarchy similarly and need to be
// kept consistent
//----------------------------------------------------------
ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size) {
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_team *team = thr->th.th_team;
if (team == NULL)
return NULL;
ompt_lw_taskteam_t *next_lwt = LWT_FROM_TEAM(team), *lwt = NULL;
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && team) {
if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
team = team->t.t_parent;
if (team) {
next_lwt = LWT_FROM_TEAM(team);
}
}
}
depth--;
}
if (lwt) {
// lightweight teams have one task
if (size)
*size = 1;
// return team info for lightweight team
return &lwt->ompt_team_info;
} else if (team) {
// extract size from heavyweight team
if (size)
*size = team->t.t_nproc;
// return team info for heavyweight team
return &team->t.ompt_team_info;
}
}
return NULL;
}
ompt_task_info_t *__ompt_get_task_info_object(int depth) {
ompt_task_info_t *info = NULL;
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team);
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
taskdata = taskdata->td_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
}
}
}
depth--;
}
if (lwt) {
info = &lwt->ompt_task_info;
} else if (taskdata) {
info = &taskdata->ompt_task_info;
}
}
return info;
}
ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) {
ompt_task_info_t *info = NULL;
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team);
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
// first try scheduling parent (for explicit task scheduling)
if (taskdata->ompt_task_info.scheduling_parent) {
taskdata = taskdata->ompt_task_info.scheduling_parent;
} else if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
// then go for implicit tasks
taskdata = taskdata->td_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
}
}
}
depth--;
}
if (lwt) {
info = &lwt->ompt_task_info;
} else if (taskdata) {
info = &taskdata->ompt_task_info;
}
}
return info;
}
//******************************************************************************
// interface operations
//******************************************************************************
//----------------------------------------------------------
// thread support
//----------------------------------------------------------
ompt_data_t *__ompt_get_thread_data_internal() {
if (__kmp_get_gtid() >= 0) {
kmp_info_t *thread = ompt_get_thread();
if (thread == NULL)
return NULL;
return &(thread->th.ompt_thread_info.thread_data);
}
return NULL;
}
//----------------------------------------------------------
// state support
//----------------------------------------------------------
void __ompt_thread_assign_wait_id(void *variable) {
kmp_info_t *ti = ompt_get_thread();
if (ti)
ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t)(uintptr_t)variable;
}
int __ompt_get_state_internal(ompt_wait_id_t *omp_wait_id) {
kmp_info_t *ti = ompt_get_thread();
if (ti) {
if (omp_wait_id)
*omp_wait_id = ti->th.ompt_thread_info.wait_id;
return ti->th.ompt_thread_info.state;
}
return ompt_state_undefined;
}
//----------------------------------------------------------
// parallel region support
//----------------------------------------------------------
int __ompt_get_parallel_info_internal(int ancestor_level,
ompt_data_t **parallel_data,
int *team_size) {
if (__kmp_get_gtid() >= 0) {
ompt_team_info_t *info;
if (team_size) {
info = __ompt_get_teaminfo(ancestor_level, team_size);
} else {
info = __ompt_get_teaminfo(ancestor_level, NULL);
}
if (parallel_data) {
*parallel_data = info ? &(info->parallel_data) : NULL;
}
return info ? 2 : 0;
} else {
return 0;
}
}
//----------------------------------------------------------
// lightweight task team support
//----------------------------------------------------------
void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid,
ompt_data_t *ompt_pid, void *codeptr) {
// initialize parallel_data with input, return address to parallel_data on
// exit
lwt->ompt_team_info.parallel_data = *ompt_pid;
lwt->ompt_team_info.master_return_address = codeptr;
lwt->ompt_task_info.task_data.value = 0;
lwt->ompt_task_info.frame.enter_frame = ompt_data_none;
lwt->ompt_task_info.frame.exit_frame = ompt_data_none;
lwt->ompt_task_info.scheduling_parent = NULL;
lwt->heap = 0;
lwt->parent = 0;
}
void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
int on_heap, bool always) {
ompt_lw_taskteam_t *link_lwt = lwt;
if (always ||
thr->th.th_team->t.t_serialized >
1) { // we already have a team, so link the new team and swap values
if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap
link_lwt =
(ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
}
link_lwt->heap = on_heap;
// this would be a swap in the (on_stack) case.
ompt_team_info_t tmp_team = lwt->ompt_team_info;
link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
ompt_task_info_t tmp_task = lwt->ompt_task_info;
link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
*OMPT_CUR_TASK_INFO(thr) = tmp_task;
// link the taskteam into the list of taskteams:
ompt_lw_taskteam_t *my_parent =
thr->th.th_team->t.ompt_serialized_team_info;
link_lwt->parent = my_parent;
thr->th.th_team->t.ompt_serialized_team_info = link_lwt;
} else {
// this is the first serialized team, so we just store the values in the
// team and drop the taskteam-object
*OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info;
*OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info;
}
}
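// Undo __ompt_lw_taskteam_link: pop the most recent lightweight task team,
// swap its saved team/task info back into the thread, and free it if it was
// heap-allocated.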
void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
if (lwtask) {
thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent;
ompt_team_info_t tmp_team = lwtask->ompt_team_info;
lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
ompt_task_info_t tmp_task = lwtask->ompt_task_info;
lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
*OMPT_CUR_TASK_INFO(thr) = tmp_task;
if (lwtask->heap) {
__kmp_free(lwtask);
lwtask = NULL;
}
}
// return lwtask;
}
//----------------------------------------------------------
// task support
//----------------------------------------------------------
int __ompt_get_task_info_internal(int ancestor_level, int *type,
ompt_data_t **task_data,
ompt_frame_t **task_frame,
ompt_data_t **parallel_data,
int *thread_num) {
if (__kmp_get_gtid() < 0)
return 0;
if (ancestor_level < 0)
return 0;
// copied from __ompt_get_scheduling_taskinfo
ompt_task_info_t *info = NULL;
ompt_team_info_t *team_info = NULL;
kmp_info_t *thr = ompt_get_thread();
int level = ancestor_level;
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
if (taskdata == NULL)
return 0;
kmp_team *team = thr->th.th_team, *prev_team = NULL;
if (team == NULL)
return 0;
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team),
*prev_lwt = NULL;
while (ancestor_level > 0) {
// needed for thread_num
prev_team = team;
prev_lwt = lwt;
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
// first try scheduling parent (for explicit task scheduling)
if (taskdata->ompt_task_info.scheduling_parent) {
taskdata = taskdata->ompt_task_info.scheduling_parent;
} else if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
// then go for implicit tasks
taskdata = taskdata->td_parent;
if (team == NULL)
return 0;
team = team->t.t_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
}
}
}
ancestor_level--;
}
if (lwt) {
info = &lwt->ompt_task_info;
team_info = &lwt->ompt_team_info;
if (type) {
*type = ompt_task_implicit;
}
} else if (taskdata) {
info = &taskdata->ompt_task_info;
team_info = &team->t.ompt_team_info;
if (type) {
if (taskdata->td_parent) {
*type = (taskdata->td_flags.tasktype ? ompt_task_explicit
: ompt_task_implicit) |
TASK_TYPE_DETAILS_FORMAT(taskdata);
} else {
*type = ompt_task_initial;
}
}
}
if (task_data) {
*task_data = info ? &info->task_data : NULL;
}
if (task_frame) {
// OpenMP spec asks for the scheduling task to be returned.
*task_frame = info ? &info->frame : NULL;
}
if (parallel_data) {
*parallel_data = team_info ? &(team_info->parallel_data) : NULL;
}
if (thread_num) {
if (level == 0)
*thread_num = __kmp_get_tid();
else if (prev_lwt)
*thread_num = 0;
else
*thread_num = prev_team->t.t_master_tid;
// *thread_num = team->t.t_master_tid;
}
return info ? 2 : 0;
}
return 0;
}
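// Report the address and size of the data block that follows the current
// explicit task's descriptor. Only block 0 of explicit tasks is supported;
// returns 1 on success, 0 otherwise.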
int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
if (blocknum != 0)
return 0; // support only a single block
kmp_info_t *thr = ompt_get_thread();
if (!thr)
return 0;
kmp_taskdata_t *taskdata = thr->th.th_current_task;
kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
if (taskdata->td_flags.tasktype != TASK_EXPLICIT)
return 0; // support only explicit task
void *ret_addr;
int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t);
// kmp_task_t->data1 is an optional member
if (taskdata->td_flags.destructors_thunk)
ret_addr = &task->data1 + 1;
else
ret_addr = &task->part_id + 1;
ret_size -= (char *)(ret_addr) - (char *)(task);
if (ret_size < 0)
return 0;
*addr = ret_addr;
*size = ret_size;
return 1;
}
//----------------------------------------------------------
// team support
//----------------------------------------------------------
void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) {
team->t.ompt_team_info.parallel_data = ompt_pid;
}
//----------------------------------------------------------
// misc
//----------------------------------------------------------
static uint64_t __ompt_get_unique_id_internal() {
static uint64_t thread = 1;
static THREAD_LOCAL uint64_t ID = 0;
if (ID == 0) {
uint64_t new_thread = KMP_TEST_THEN_INC64((kmp_int64 *)&thread);
ID = new_thread << (sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS);
}
return ++ID;
}
ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt,
kmp_info_t *thr) {
if (bt == bs_forkjoin_barrier)
return ompt_sync_region_barrier_implicit;
if (bt != bs_plain_barrier)
return ompt_sync_region_barrier_implementation;
if (!thr->th.th_ident)
return ompt_sync_region_barrier;
kmp_int32 flags = thr->th.th_ident->flags;
if ((flags & KMP_IDENT_BARRIER_EXPL) != 0)
return ompt_sync_region_barrier_explicit;
if ((flags & KMP_IDENT_BARRIER_IMPL) != 0)
return ompt_sync_region_barrier_implicit;
return ompt_sync_region_barrier_implementation;
}
